spack/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch

From 27ae15a459f45f1acfcb1a9b1c8d491d9f731fd4 Mon Sep 17 00:00:00 2001
From: Steve Leung <Steve.Leung@amd.com>
Date: Thu, 4 Jan 2024 16:36:08 -0700
Subject: [PATCH] remove submodule and sync shared files from rocFFT, update
CHANGELOG.md
---
clients/CMakeLists.txt | 15 -
clients/bench/CMakeLists.txt | 4 +-
clients/bench/bench.cpp | 2 +-
clients/hipfft_params.h | 2 +-
clients/tests/CMakeLists.txt | 11 +-
clients/tests/accuracy_test_1D.cpp | 8 +-
clients/tests/accuracy_test_2D.cpp | 8 +-
clients/tests/accuracy_test_3D.cpp | 8 +-
clients/tests/accuracy_test_callback.cpp | 2 +-
clients/tests/gtest_main.cpp | 6 +-
clients/tests/hipfft_accuracy_test.cpp | 11 +-
clients/tests/hipfft_accuracy_test.h | 2 +-
clients/tests/multi_device_test.cpp | 2 +-
cmake/dependencies.cmake | 3 -
library/src/amd_detail/hipfft.cpp | 8 +-
shared/accuracy_test.h | 1949 +++++++++++++
shared/arithmetic.h | 61 +
shared/array_predicate.h | 47 +
shared/array_validator.cpp | 549 ++++
shared/array_validator.h | 31 +
shared/concurrency.h | 41 +
shared/data_gen_device.h | 1303 +++++++++
shared/data_gen_host.h | 881 ++++++
shared/device_properties.h | 74 +
shared/enum_to_string.h | 81 +
shared/environment.h | 97 +
shared/fft_params.h | 3274 ++++++++++++++++++++++
shared/fftw_transform.h | 493 ++++
shared/gpubuf.h | 134 +
shared/hip_object_wrapper.h | 86 +
shared/hostbuf.h | 158 ++
shared/increment.h | 100 +
shared/precision_type.h | 70 +
shared/printbuffer.h | 108 +
shared/ptrdiff.h | 40 +
shared/rocfft_accuracy_test.h | 29 +
shared/rocfft_against_fftw.h | 231 ++
shared/rocfft_complex.h | 346 +++
shared/rocfft_hip.h | 52 +
shared/rocfft_params.h | 585 ++++
shared/test_params.h | 51 +
shared/work_queue.h | 49 +
46 files changed, 10966 insertions(+), 66 deletions(-)
create mode 100644 shared/accuracy_test.h
create mode 100644 shared/arithmetic.h
create mode 100644 shared/array_predicate.h
create mode 100644 shared/array_validator.cpp
create mode 100644 shared/array_validator.h
create mode 100644 shared/concurrency.h
create mode 100644 shared/data_gen_device.h
create mode 100644 shared/data_gen_host.h
create mode 100644 shared/device_properties.h
create mode 100644 shared/enum_to_string.h
create mode 100644 shared/environment.h
create mode 100644 shared/fft_params.h
create mode 100644 shared/fftw_transform.h
create mode 100644 shared/gpubuf.h
create mode 100644 shared/hip_object_wrapper.h
create mode 100644 shared/hostbuf.h
create mode 100644 shared/increment.h
create mode 100644 shared/precision_type.h
create mode 100644 shared/printbuffer.h
create mode 100644 shared/ptrdiff.h
create mode 100644 shared/rocfft_accuracy_test.h
create mode 100644 shared/rocfft_against_fftw.h
create mode 100644 shared/rocfft_complex.h
create mode 100644 shared/rocfft_hip.h
create mode 100644 shared/rocfft_params.h
create mode 100644 shared/test_params.h
create mode 100644 shared/work_queue.h
diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt
index 1db0d9c..b99a9e5 100644
--- a/clients/CMakeLists.txt
+++ b/clients/CMakeLists.txt
@@ -65,21 +65,6 @@ if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" AND NOT CMAKE_CXX_COMPILER_ID STR
endif()
-if( GIT_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git" )
- message(STATUS "rocFFT submodule update")
- execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
- WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/clients/rocFFT
- RESULT_VARIABLE GIT_SUBMOD_RESULT)
- if( NOT GIT_SUBMOD_RESULT EQUAL "0" )
- message(FATAL_ERROR "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules manually.")
- endif( )
-endif( )
-
-if( NOT EXISTS "${CMAKE_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt" )
- message(FATAL_ERROR "The rocFFT submodule is not present! Please update git submodules and try again. ${CMAKE_CURRENT_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt")
-endif( )
-
-
# This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on
# all the time
# This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim
diff --git a/clients/bench/CMakeLists.txt b/clients/bench/CMakeLists.txt
index b5cef9b..ccb8c29 100644
--- a/clients/bench/CMakeLists.txt
+++ b/clients/bench/CMakeLists.txt
@@ -26,8 +26,8 @@ find_package( Boost COMPONENTS program_options REQUIRED)
set( Boost_USE_STATIC_LIBS OFF )
-set( hipfft_bench_source bench.cpp ../rocFFT/shared/array_validator.cpp )
-set( hipfft_bench_includes bench.h ../rocFFT/shared/array_validator.h )
+set( hipfft_bench_source bench.cpp ../../shared/array_validator.cpp )
+set( hipfft_bench_includes bench.h ../../shared/array_validator.h )
add_executable( hipfft-bench ${hipfft_bench_source} ${hipfft_bench_includes} )
diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
index 894769c..a906879 100644
--- a/clients/bench/bench.cpp
+++ b/clients/bench/bench.cpp
@@ -29,7 +29,7 @@
#include <boost/program_options.hpp>
namespace po = boost::program_options;
-#include "../rocFFT/shared/gpubuf.h"
+#include "../../shared/gpubuf.h"
int main(int argc, char* argv[])
{
diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h
index b8b58ac..75d9db9 100644
--- a/clients/hipfft_params.h
+++ b/clients/hipfft_params.h
@@ -23,9 +23,9 @@
#include <optional>
+#include "../shared/fft_params.h"
#include "hipfft/hipfft.h"
#include "hipfft/hipfftXt.h"
-#include "rocFFT/shared/fft_params.h"
inline fft_status fft_status_from_hipfftparams(const hipfftResult_t val)
{
diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt
index 9742a45..2d1aac0 100644
--- a/clients/tests/CMakeLists.txt
+++ b/clients/tests/CMakeLists.txt
@@ -37,14 +37,7 @@ set( hipfft-test_source
accuracy_test_3D.cpp
accuracy_test_callback.cpp
multi_device_test.cpp
- ../rocFFT/shared/array_validator.cpp
- )
-
-set( hipfft-test_includes
- ../rocFFT/clients/tests/fftw_transform.h
- ../rocFFT/clients/tests/rocfft_against_fftw.h
- ../rocFFT/clients/tests/misc/include/test_exception.h
- ../rocFFT/shared/array_validator.h
+ ../../shared/array_validator.cpp
)
add_executable( hipfft-test ${hipfft-test_source} ${hipfft-test_includes} )
@@ -56,8 +49,6 @@ target_include_directories(
$<BUILD_INTERFACE:${FFTW_INCLUDE_DIRS}>
$<BUILD_INTERFACE:${hip_INCLUDE_DIRS}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../library/include>
- $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/library/include>
- $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/clients/tests>
)
diff --git a/clients/tests/accuracy_test_1D.cpp b/clients/tests/accuracy_test_1D.cpp
index 27e849d..57d846a 100644
--- a/clients/tests/accuracy_test_1D.cpp
+++ b/clients/tests/accuracy_test_1D.cpp
@@ -23,11 +23,11 @@
#include <stdexcept>
#include <vector>
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
using ::testing::ValuesIn;
diff --git a/clients/tests/accuracy_test_2D.cpp b/clients/tests/accuracy_test_2D.cpp
index 1674593..6f618c0 100644
--- a/clients/tests/accuracy_test_2D.cpp
+++ b/clients/tests/accuracy_test_2D.cpp
@@ -23,11 +23,11 @@
#include <stdexcept>
#include <vector>
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
using ::testing::ValuesIn;
diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp
index a87476a..941ec24 100644
--- a/clients/tests/accuracy_test_3D.cpp
+++ b/clients/tests/accuracy_test_3D.cpp
@@ -23,11 +23,11 @@
#include <stdexcept>
#include <vector>
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
using ::testing::ValuesIn;
diff --git a/clients/tests/accuracy_test_callback.cpp b/clients/tests/accuracy_test_callback.cpp
index 4782830..b5cc4a7 100644
--- a/clients/tests/accuracy_test_callback.cpp
+++ b/clients/tests/accuracy_test_callback.cpp
@@ -18,7 +18,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#include "accuracy_test.h"
+#include "../../shared/accuracy_test.h"
std::vector<std::vector<size_t>> callback_sizes = {
// some single kernel sizes
diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
index 1f0ae83..2f7674e 100644
--- a/clients/tests/gtest_main.cpp
+++ b/clients/tests/gtest_main.cpp
@@ -30,10 +30,10 @@
#include <streambuf>
#include <string>
+#include "../../shared/concurrency.h"
+#include "../../shared/environment.h"
+#include "../../shared/work_queue.h"
#include "../hipfft_params.h"
-#include "../rocFFT/shared/concurrency.h"
-#include "../rocFFT/shared/environment.h"
-#include "../rocFFT/shared/work_queue.h"
#include "hipfft/hipfft.h"
#include "hipfft_accuracy_test.h"
#include "hipfft_test_params.h"
diff --git a/clients/tests/hipfft_accuracy_test.cpp b/clients/tests/hipfft_accuracy_test.cpp
index 2abaf74..609239a 100644
--- a/clients/tests/hipfft_accuracy_test.cpp
+++ b/clients/tests/hipfft_accuracy_test.cpp
@@ -29,11 +29,12 @@
#include "hipfft/hipfft.h"
#include "../hipfft_params.h"
-#include "../rocFFT/clients/tests/fftw_transform.h"
-#include "../rocFFT/clients/tests/rocfft_accuracy_test.h"
-#include "../rocFFT/clients/tests/rocfft_against_fftw.h"
-#include "../rocFFT/shared/gpubuf.h"
-#include "../rocFFT/shared/rocfft_complex.h"
+
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/gpubuf.h"
+#include "../../shared/rocfft_against_fftw.h"
+#include "../../shared/rocfft_complex.h"
void fft_vs_reference(hipfft_params& params, bool round_trip)
{
diff --git a/clients/tests/hipfft_accuracy_test.h b/clients/tests/hipfft_accuracy_test.h
index 0491bd9..181150e 100644
--- a/clients/tests/hipfft_accuracy_test.h
+++ b/clients/tests/hipfft_accuracy_test.h
@@ -23,8 +23,8 @@
#ifndef ROCFFT_ACCURACY_TEST
#define ROCFFT_ACCURACY_TEST
+#include "../../shared/accuracy_test.h"
#include "../hipfft_params.h"
-#include "../rocFFT/clients/tests/accuracy_test.h"
void fft_vs_reference(hipfft_params& params, bool round_trip = false);
diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp
index b3dc4c9..3274b80 100644
--- a/clients/tests/multi_device_test.cpp
+++ b/clients/tests/multi_device_test.cpp
@@ -18,7 +18,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#include "accuracy_test.h"
+#include "../../shared/accuracy_test.h"
#include <gtest/gtest.h>
#include <hip/hip_runtime_api.h>
diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index 5810e37..bdbf689 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -21,9 +21,6 @@
#
# #############################################################################
-# Git
-find_package(Git REQUIRED)
-
# HIP
if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
diff --git a/library/src/amd_detail/hipfft.cpp b/library/src/amd_detail/hipfft.cpp
index c2f7036..3d4f61f 100644
--- a/library/src/amd_detail/hipfft.cpp
+++ b/library/src/amd_detail/hipfft.cpp
@@ -27,10 +27,10 @@
#include <string>
#include <vector>
-#include "../../../clients/rocFFT/shared/arithmetic.h"
-#include "../../../clients/rocFFT/shared/gpubuf.h"
-#include "../../../clients/rocFFT/shared/ptrdiff.h"
-#include "../../../clients/rocFFT/shared/rocfft_hip.h"
+#include "../../../shared/arithmetic.h"
+#include "../../../shared/gpubuf.h"
+#include "../../../shared/ptrdiff.h"
+#include "../../../shared/rocfft_hip.h"
#define ROC_FFT_CHECK_ALLOC_FAILED(ret) \
{ \
diff --git a/shared/accuracy_test.h b/shared/accuracy_test.h
new file mode 100644
index 0000000..362a7c1
--- /dev/null
+++ b/shared/accuracy_test.h
@@ -0,0 +1,1949 @@
+// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#ifndef ACCURACY_TEST
+#define ACCURACY_TEST
+
+#include <algorithm>
+#include <functional>
+#include <future>
+#include <iterator>
+#include <string>
+#include <vector>
+
+#include "enum_to_string.h"
+#include "fft_params.h"
+#include "fftw_transform.h"
+#include "gpubuf.h"
+#include "rocfft_against_fftw.h"
+#include "test_params.h"
+
+extern int verbose;
+extern size_t ramgb;
+extern bool fftw_compare;
+
+static const size_t ONE_GiB = 1 << 30;
+
+inline size_t bytes_to_GiB(const size_t bytes)
+{
+ return bytes == 0 ? 0 : (bytes - 1 + ONE_GiB) / ONE_GiB;
+}
+
+typedef std::tuple<fft_transform_type, fft_result_placement, fft_array_type, fft_array_type>
+ type_place_io_t;
+
+// Remember the results of the last FFT we computed with FFTW. Tests
+// are ordered so that later cases can often reuse this result.
+struct last_cpu_fft_cache
+{
+ // keys to the cache
+ std::vector<size_t> length;
+ size_t nbatch = 0;
+ fft_transform_type transform_type = fft_transform_type_complex_forward;
+ bool run_callbacks = false;
+ fft_precision precision = fft_precision_single;
+
+ // FFTW input/output
+ std::vector<hostbuf> cpu_input;
+ std::vector<hostbuf> cpu_output;
+};
+extern last_cpu_fft_cache last_cpu_fft_data;
+
+struct system_memory
+{
+ size_t total_bytes = 0;
+ size_t free_bytes = 0;
+};
+extern system_memory start_memory;
+
+system_memory get_system_memory();
+
+// Estimate the amount of host memory needed for buffers.
+inline size_t needed_ram_buffers(const fft_params& params, const int verbose)
+{
+ // This calculation is assuming contiguous data but noncontiguous buffers
+ // are assumed to require a close enough amount of space for the purposes
+ // of this estimate.
+
+ size_t needed_ram = 6
+ * std::accumulate(params.length.begin(),
+ params.length.end(),
+ static_cast<size_t>(1),
+ std::multiplies<size_t>());
+
+ // Account for precision and data type:
+ if(params.transform_type != fft_transform_type_real_forward
+ && params.transform_type != fft_transform_type_real_inverse)
+ {
+ needed_ram *= 2;
+ }
+ switch(params.precision)
+ {
+ case fft_precision_half:
+ needed_ram *= 2;
+ break;
+ case fft_precision_single:
+ needed_ram *= 4;
+ break;
+ case fft_precision_double:
+ needed_ram *= 8;
+ break;
+ }
+
+ needed_ram *= params.nbatch;
+
+ if(verbose)
+ {
+ std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n";
+ }
+
+ return needed_ram;
+}
+
+template <typename Tfloat>
+bool fftw_plan_uses_bluestein(const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan)
+{
+#ifdef FFTW_HAVE_SPRINT_PLAN
+ char* print_plan_c_str = fftw_sprint_plan<Tfloat>(cpu_plan);
+ std::string print_plan(print_plan_c_str);
+ free(print_plan_c_str);
+ return print_plan.find("bluestein") != std::string::npos;
+#else
+ // assume worst case (bluestein is always used)
+ return true;
+#endif
+}
+
+// Estimate the amount of host memory needed for fftw.
+template <typename Tfloat>
+inline size_t needed_ram_fftw(const fft_params& contiguous_params,
+ const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
+ const int verbose)
+{
+ size_t total_length = std::accumulate(contiguous_params.length.begin(),
+ contiguous_params.length.end(),
+ static_cast<size_t>(1),
+ std::multiplies<size_t>());
+ size_t needed_ram = 0;
+ // Detect Bluestein in plan
+ if(fftw_plan_uses_bluestein<Tfloat>(cpu_plan))
+ {
+ for(size_t dim : contiguous_params.length)
+ {
+ unsigned int needed_ram_dim = dim;
+
+ // Next-plus-one-power-of-two multiplied any other lengths
+ needed_ram_dim--;
+
+ needed_ram_dim |= needed_ram_dim >> 2;
+ needed_ram_dim |= needed_ram_dim >> 4;
+ needed_ram_dim |= needed_ram_dim >> 8;
+ needed_ram_dim |= needed_ram_dim >> 16;
+
+ needed_ram_dim++;
+
+ needed_ram_dim *= 2 * (total_length / dim);
+
+ if(needed_ram_dim > needed_ram)
+ {
+ needed_ram = needed_ram_dim;
+ }
+ }
+ }
+
+ // Account for precision and data type:
+ if(contiguous_params.transform_type != fft_transform_type_real_forward
+ && contiguous_params.transform_type != fft_transform_type_real_inverse)
+ {
+ needed_ram *= 2;
+ }
+ switch(contiguous_params.precision)
+ {
+ case fft_precision_half:
+ needed_ram *= 2;
+ break;
+ case fft_precision_single:
+ needed_ram *= 4;
+ break;
+ case fft_precision_double:
+ needed_ram *= 8;
+ break;
+ }
+
+ needed_ram *= contiguous_params.nbatch;
+
+ if(verbose)
+ {
+ std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n";
+ }
+
+ return needed_ram;
+}
+
+// Base gtest class for comparison with FFTW.
+class accuracy_test : public ::testing::TestWithParam<fft_params>
+{
+protected:
+ void SetUp() override {}
+ void TearDown() override {}
+
+public:
+ static std::string TestName(const testing::TestParamInfo<accuracy_test::ParamType>& info)
+ {
+ return info.param.token();
+ }
+};
+
+const static std::vector<size_t> batch_range = {2, 1};
+
+const static std::vector<fft_precision> precision_range_full
+ = {fft_precision_double, fft_precision_single, fft_precision_half};
+const static std::vector<fft_precision> precision_range_sp_dp
+ = {fft_precision_double, fft_precision_single};
+
+const static std::vector<fft_result_placement> place_range
+ = {fft_placement_inplace, fft_placement_notinplace};
+const static std::vector<fft_transform_type> trans_type_range
+ = {fft_transform_type_complex_forward, fft_transform_type_real_forward};
+const static std::vector<fft_transform_type> trans_type_range_complex
+ = {fft_transform_type_complex_forward};
+const static std::vector<fft_transform_type> trans_type_range_real
+ = {fft_transform_type_real_forward};
+
+// Given a vector of vector of lengths, generate all unique permutations.
+// Add an optional vector of ad-hoc lengths to the result.
+inline std::vector<std::vector<size_t>>
+ generate_lengths(const std::vector<std::vector<size_t>>& inlengths)
+{
+ std::vector<std::vector<size_t>> output;
+ if(inlengths.size() == 0)
+ {
+ return output;
+ }
+ const size_t dim = inlengths.size();
+ std::vector<size_t> looplength(dim);
+ for(unsigned int i = 0; i < dim; ++i)
+ {
+ looplength[i] = inlengths[i].size();
+ }
+ for(unsigned int idx = 0; idx < inlengths.size(); ++idx)
+ {
+ std::vector<size_t> index(dim);
+ do
+ {
+ std::vector<size_t> length(dim);
+ for(unsigned int i = 0; i < dim; ++i)
+ {
+ length[i] = inlengths[i][index[i]];
+ }
+ output.push_back(length);
+ } while(increment_rowmajor(index, looplength));
+ }
+ // uniquify the result
+ std::sort(output.begin(), output.end());
+ output.erase(std::unique(output.begin(), output.end()), output.end());
+ return output;
+}
+
+// Return the valid rocFFT input and output types for a given transform type.
+inline std::vector<std::pair<fft_array_type, fft_array_type>>
+ iotypes(const fft_transform_type transformType,
+ const fft_result_placement place,
+ const bool planar = true)
+{
+ std::vector<std::pair<fft_array_type, fft_array_type>> iotypes;
+ switch(transformType)
+ {
+ case fft_transform_type_complex_forward:
+ case fft_transform_type_complex_inverse:
+ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+ fft_array_type_complex_interleaved, fft_array_type_complex_interleaved));
+ if(planar)
+ {
+ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+ fft_array_type_complex_planar, fft_array_type_complex_planar));
+ if(place == fft_placement_notinplace)
+ {
+ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+ fft_array_type_complex_planar, fft_array_type_complex_interleaved));
+ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+ fft_array_type_complex_interleaved, fft_array_type_complex_planar));
+ }
+ }
+ break;
+ case fft_transform_type_real_forward:
+ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+ fft_array_type_real, fft_array_type_hermitian_interleaved));
+ if(planar && place == fft_placement_notinplace)
+ {
+ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+ fft_array_type_real, fft_array_type_hermitian_planar));
+ }
+ break;
+ case fft_transform_type_real_inverse:
+ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+ fft_array_type_hermitian_interleaved, fft_array_type_real));
+ if(planar && place == fft_placement_notinplace)
+ {
+ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+ fft_array_type_hermitian_planar, fft_array_type_real));
+ }
+ break;
+ default:
+ throw std::runtime_error("Invalid transform type");
+ }
+ return iotypes;
+}
+
+// Generate all combinations of input/output types, from combinations of transform and placement
+// types.
+static std::vector<type_place_io_t>
+ generate_types(fft_transform_type transform_type,
+ const std::vector<fft_result_placement>& place_range,
+ const bool planar)
+{
+ std::vector<type_place_io_t> ret;
+ for(auto place : place_range)
+ {
+ for(auto iotype : iotypes(transform_type, place, planar))
+ {
+ ret.push_back(std::make_tuple(transform_type, place, iotype.first, iotype.second));
+ }
+ }
+ return ret;
+}
+
+struct stride_generator
+{
+ struct stride_dist
+ {
+ stride_dist(const std::vector<size_t>& s, size_t d)
+ : stride(s)
+ , dist(d)
+ {
+ }
+ std::vector<size_t> stride;
+ size_t dist;
+ };
+
+ // NOTE: allow for this ctor to be implicit, so it's less typing for a test writer
+ //
+ // cppcheck-suppress noExplicitConstructor
+ stride_generator(const std::vector<std::vector<size_t>>& stride_list_in)
+ : stride_list(stride_list_in)
+ {
+ }
+ virtual std::vector<stride_dist> generate(const std::vector<size_t>& lengths,
+ size_t batch) const
+ {
+ std::vector<stride_dist> ret;
+ for(const auto& s : stride_list)
+ ret.emplace_back(s, 0);
+ return ret;
+ }
+ std::vector<std::vector<size_t>> stride_list;
+};
+
+// Generate strides such that batch is essentially the innermost dimension
+// e.g. given a batch-2 4x3x2 transform which logically looks like:
+//
+// batch0:
+// A B A B
+// A B A B
+// A B A B
+//
+// A B A B
+// A B A B
+// A B A B
+//
+// batch1:
+// A B A B
+// A B A B
+// A B A B
+//
+// A B A B
+// A B A B
+// A B A B
+//
+// we instead do stride-2 4x3x2 transform where first batch is the
+// A's and second batch is the B's.
+struct stride_generator_3D_inner_batch : public stride_generator
+{
+ explicit stride_generator_3D_inner_batch(const std::vector<std::vector<size_t>>& stride_list_in)
+ : stride_generator(stride_list_in)
+ {
+ }
+ std::vector<stride_dist> generate(const std::vector<size_t>& lengths,
+ size_t batch) const override
+ {
+ std::vector<stride_dist> ret = stride_generator::generate(lengths, batch);
+ std::vector<size_t> strides{lengths[1] * lengths[2] * batch, lengths[2] * batch, batch};
+ ret.emplace_back(strides, 1);
+ return ret;
+ }
+};
+
+// Create an array of parameters to pass to gtest. Base generator
+// that allows choosing transform type.
+inline auto param_generator_base(const std::vector<fft_transform_type>& type_range,
+ const std::vector<std::vector<size_t>>& v_lengths,
+ const std::vector<fft_precision>& precision_range,
+ const std::vector<size_t>& batch_range,
+ decltype(generate_types) types_generator,
+ const stride_generator& istride,
+ const stride_generator& ostride,
+ const std::vector<std::vector<size_t>>& ioffset_range,
+ const std::vector<std::vector<size_t>>& ooffset_range,
+ const std::vector<fft_result_placement>& place_range,
+ const bool planar = true,
+ const bool run_callbacks = false)
+{
+
+ std::vector<fft_params> params;
+
+ // For any length, we compute double-precision CPU reference
+ // for largest batch size first and reuse for smaller batch
+ // sizes, then convert to single-precision.
+
+ for(auto& transform_type : type_range)
+ {
+ for(const auto& lengths : v_lengths)
+ {
+ // try to ensure that we are given literal lengths, not
+ // something to be passed to generate_lengths
+ if(lengths.empty() || lengths.size() > 3)
+ {
+ continue;
+ }
+ {
+ for(const auto precision : precision_range)
+ {
+ for(const auto batch : batch_range)
+ {
+ for(const auto& types :
+ types_generator(transform_type, place_range, planar))
+ {
+ for(const auto& istride_dist : istride.generate(lengths, batch))
+ {
+ for(const auto& ostride_dist : ostride.generate(lengths, batch))
+ {
+ for(const auto& ioffset : ioffset_range)
+ {
+ for(const auto& ooffset : ooffset_range)
+ {
+ fft_params param;
+
+ param.length = lengths;
+ param.istride = istride_dist.stride;
+ param.ostride = ostride_dist.stride;
+ param.nbatch = batch;
+ param.precision = precision;
+ param.transform_type = std::get<0>(types);
+ param.placement = std::get<1>(types);
+ param.idist = istride_dist.dist;
+ param.odist = ostride_dist.dist;
+ param.itype = std::get<2>(types);
+ param.otype = std::get<3>(types);
+ param.ioffset = ioffset;
+ param.ooffset = ooffset;
+
+ if(run_callbacks)
+ {
+ // add a test if both input and output support callbacks
+ if(param.itype != fft_array_type_complex_planar
+ && param.itype != fft_array_type_hermitian_planar
+ && param.otype != fft_array_type_complex_planar
+ && param.otype
+ != fft_array_type_hermitian_planar)
+ {
+ param.run_callbacks = true;
+ }
+ else
+ {
+ continue;
+ }
+ }
+ param.validate();
+
+ // Keeping the random number generator here
+ // allows one to run the same tests for a given
+ // random seed; ie the test suite is repeatable.
+ std::hash<std::string> hasher;
+ std::ranlux24_base gen(random_seed
+ + hasher(param.token()));
+ std::uniform_real_distribution<> dis(0.0, 1.0);
+
+ if(param.is_planar())
+ {
+ const double roll = dis(gen);
+ if(roll > planar_prob)
+ {
+ if(verbose > 4)
+ {
+ std::cout << "Planar transform skipped "
+ "(planar_prob: "
+ << planar_prob << " > " << roll
+ << ")\n";
+ }
+ continue;
+ }
+ }
+ if(run_callbacks)
+ {
+ const double roll = dis(gen);
+ if(roll > callback_prob)
+ {
+
+ if(verbose > 4)
+ {
+ std::cout << "Callback transform skipped "
+ "(planar_prob: "
+ << planar_prob << " > " << roll
+ << ")\n";
+ }
+ continue;
+ }
+ }
+
+ if(param.valid(0))
+ {
+ params.push_back(param);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return params;
+}
+
+// Create an array of parameters to pass to gtest. Default generator
+// that picks all transform types.
+inline auto param_generator(const std::vector<std::vector<size_t>>& v_lengths,
+ const std::vector<fft_precision>& precision_range,
+ const std::vector<size_t>& batch_range,
+ const stride_generator& istride,
+ const stride_generator& ostride,
+ const std::vector<std::vector<size_t>>& ioffset_range,
+ const std::vector<std::vector<size_t>>& ooffset_range,
+ const std::vector<fft_result_placement>& place_range,
+ const bool planar,
+ const bool run_callbacks = false)
+{
+ return param_generator_base(trans_type_range,
+ v_lengths,
+ precision_range,
+ batch_range,
+ generate_types,
+ istride,
+ ostride,
+ ioffset_range,
+ ooffset_range,
+ place_range,
+ planar,
+ run_callbacks);
+}
+
+// Create an array of parameters to pass to gtest. Only tests complex-type transforms
+inline auto param_generator_complex(const std::vector<std::vector<size_t>>& v_lengths,
+ const std::vector<fft_precision>& precision_range,
+ const std::vector<size_t>& batch_range,
+ const stride_generator& istride,
+ const stride_generator& ostride,
+ const std::vector<std::vector<size_t>>& ioffset_range,
+ const std::vector<std::vector<size_t>>& ooffset_range,
+ const std::vector<fft_result_placement>& place_range,
+ const bool planar,
+ const bool run_callbacks = false)
+{
+ return param_generator_base(trans_type_range_complex,
+ v_lengths,
+ precision_range,
+ batch_range,
+ generate_types,
+ istride,
+ ostride,
+ ioffset_range,
+ ooffset_range,
+ place_range,
+ planar,
+ run_callbacks);
+}
+
+// Create an array of parameters to pass to gtest.
+inline auto param_generator_real(const std::vector<std::vector<size_t>>& v_lengths,
+ const std::vector<fft_precision>& precision_range,
+ const std::vector<size_t>& batch_range,
+ const stride_generator& istride,
+ const stride_generator& ostride,
+ const std::vector<std::vector<size_t>>& ioffset_range,
+ const std::vector<std::vector<size_t>>& ooffset_range,
+ const std::vector<fft_result_placement>& place_range,
+ const bool planar,
+ const bool run_callbacks = false)
+{
+ return param_generator_base(trans_type_range_real,
+ v_lengths,
+ precision_range,
+ batch_range,
+ generate_types,
+ istride,
+ ostride,
+ ioffset_range,
+ ooffset_range,
+ place_range,
+ planar,
+ run_callbacks);
+}
+
+template <class Tcontainer>
+auto param_generator_token(const Tcontainer& tokens)
+{
+ std::vector<fft_params> params;
+ params.reserve(tokens.size());
+ for(auto t : tokens)
+ {
+ params.push_back({});
+ params.back().from_token(t);
+ }
+ return params;
+}
+
+struct callback_test_data
+{
+ // scalar to modify the input/output with
+ double scalar;
+ // base address of input, to ensure that each callback gets an offset from that base
+ void* base;
+};
+
+void* get_load_callback_host(fft_array_type itype,
+ fft_precision precision,
+ bool round_trip_inverse);
+void apply_load_callback(const fft_params& params, std::vector<hostbuf>& input);
+void apply_store_callback(const fft_params& params, std::vector<hostbuf>& output);
+void* get_store_callback_host(fft_array_type otype,
+ fft_precision precision,
+ bool round_trip_inverse);
+
+static auto allocate_cpu_fft_buffer(const fft_precision precision,
+ const fft_array_type type,
+ const std::vector<size_t>& size)
+{
+ // FFTW does not support half-precision, so we do single instead.
+ // So if we need to do a half-precision FFTW transform, allocate
+ // enough buffer for single-precision instead.
+ return allocate_host_buffer(
+ precision == fft_precision_half ? fft_precision_single : precision, type, size);
+}
+
+template <typename Tfloat>
+inline void execute_cpu_fft(fft_params& params,
+ fft_params& contiguous_params,
+ typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
+ std::vector<hostbuf>& cpu_input,
+ std::vector<hostbuf>& cpu_output)
+{
+ // CPU output might not be allocated already for us, if FFTW never
+ // needed an output buffer during planning
+ if(cpu_output.empty())
+ cpu_output = allocate_cpu_fft_buffer(
+ contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
+
+ // If this is either C2R or callbacks are enabled, the
+ // input will be modified. So we need to modify the copy instead.
+ std::vector<hostbuf> cpu_input_copy(cpu_input.size());
+ std::vector<hostbuf>* input_ptr = &cpu_input;
+ if(params.run_callbacks || contiguous_params.transform_type == fft_transform_type_real_inverse)
+ {
+ for(size_t i = 0; i < cpu_input.size(); ++i)
+ {
+ cpu_input_copy[i] = cpu_input[i].copy();
+ }
+
+ input_ptr = &cpu_input_copy;
+ }
+
+ // run FFTW (which may destroy CPU input)
+ apply_load_callback(params, *input_ptr);
+ fftw_run<Tfloat>(contiguous_params.transform_type, cpu_plan, *input_ptr, cpu_output);
+ // clean up
+ fftw_destroy_plan_type(cpu_plan);
+ // ask FFTW to fully clean up, since it tries to cache plan details
+ fftw_cleanup();
+ cpu_plan = nullptr;
+ apply_store_callback(params, cpu_output);
+}
+
+// execute the GPU transform
+template <class Tparams>
+inline void execute_gpu_fft(Tparams& params,
+ std::vector<void*>& pibuffer,
+ std::vector<void*>& pobuffer,
+ std::vector<gpubuf>& obuffer,
+ std::vector<hostbuf>& gpu_output,
+ bool round_trip_inverse = false)
+{
+ gpubuf_t<callback_test_data> load_cb_data_dev;
+ gpubuf_t<callback_test_data> store_cb_data_dev;
+ if(params.run_callbacks)
+ {
+ void* load_cb_host
+ = get_load_callback_host(params.itype, params.precision, round_trip_inverse);
+
+ callback_test_data load_cb_data_host;
+
+ if(round_trip_inverse)
+ {
+ load_cb_data_host.scalar = params.store_cb_scalar;
+ }
+ else
+ {
+ load_cb_data_host.scalar = params.load_cb_scalar;
+ }
+
+ load_cb_data_host.base = pibuffer.front();
+
+ auto hip_status = hipSuccess;
+
+ hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data));
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP();
+ }
+ else
+ {
+ GTEST_FAIL();
+ }
+ }
+ hip_status = hipMemcpy(load_cb_data_dev.data(),
+ &load_cb_data_host,
+ sizeof(callback_test_data),
+ hipMemcpyHostToDevice);
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP();
+ }
+ else
+ {
+ GTEST_FAIL();
+ }
+ }
+
+ void* store_cb_host
+ = get_store_callback_host(params.otype, params.precision, round_trip_inverse);
+
+ callback_test_data store_cb_data_host;
+
+ if(round_trip_inverse)
+ {
+ store_cb_data_host.scalar = params.load_cb_scalar;
+ }
+ else
+ {
+ store_cb_data_host.scalar = params.store_cb_scalar;
+ }
+
+ store_cb_data_host.base = pobuffer.front();
+
+ hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data));
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP();
+ }
+ else
+ {
+ GTEST_FAIL();
+ }
+ }
+
+ hip_status = hipMemcpy(store_cb_data_dev.data(),
+ &store_cb_data_host,
+ sizeof(callback_test_data),
+ hipMemcpyHostToDevice);
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP();
+ }
+ else
+ {
+ GTEST_FAIL();
+ }
+ }
+
+ auto fft_status = params.set_callbacks(
+ load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data());
+ if(fft_status != fft_status_success)
+ throw std::runtime_error("set callback failure");
+ }
+
+ // Execute the transform:
+ auto fft_status = params.execute(pibuffer.data(), pobuffer.data());
+ if(fft_status != fft_status_success)
+ throw std::runtime_error("rocFFT plan execution failure");
+
+ // if not comparing, then just executing the GPU FFT is all we
+ // need to do
+ if(!fftw_compare)
+ return;
+
+ // finalize a multi-GPU transform
+ params.multi_gpu_finalize(obuffer, pobuffer);
+
+ ASSERT_TRUE(!gpu_output.empty()) << "no output buffers";
+ for(unsigned int idx = 0; idx < gpu_output.size(); ++idx)
+ {
+ ASSERT_TRUE(gpu_output[idx].data() != nullptr)
+ << "output buffer index " << idx << " is empty";
+ auto hip_status = hipMemcpy(gpu_output[idx].data(),
+ pobuffer.at(idx),
+ gpu_output[idx].size(),
+ hipMemcpyDeviceToHost);
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << "hipMemcpy failure";
+ }
+ else
+ {
+ GTEST_FAIL() << "hipMemcpy failure";
+ }
+ }
+ }
+ if(verbose > 2)
+ {
+ std::cout << "GPU output:\n";
+ params.print_obuffer(gpu_output);
+ }
+ if(verbose > 5)
+ {
+ std::cout << "flat GPU output:\n";
+ params.print_obuffer_flat(gpu_output);
+ }
+}
+
+template <typename Tfloat>
+static void assert_init_value(const std::vector<hostbuf>& output,
+ const size_t idx,
+ const Tfloat orig_value);
+
+template <>
+void assert_init_value(const std::vector<hostbuf>& output, const size_t idx, const float orig_value)
+{
+ float actual_value = reinterpret_cast<const float*>(output.front().data())[idx];
+ ASSERT_EQ(actual_value, orig_value) << "index " << idx;
+}
+
+template <>
+void assert_init_value(const std::vector<hostbuf>& output,
+ const size_t idx,
+ const double orig_value)
+{
+ double actual_value = reinterpret_cast<const double*>(output.front().data())[idx];
+ ASSERT_EQ(actual_value, orig_value) << "index " << idx;
+}
+
+template <>
+void assert_init_value(const std::vector<hostbuf>& output,
+ const size_t idx,
+ const rocfft_complex<float> orig_value)
+{
+ // if this is interleaved, check directly
+ if(output.size() == 1)
+ {
+ rocfft_complex<float> actual_value
+ = reinterpret_cast<const rocfft_complex<float>*>(output.front().data())[idx];
+ ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
+ ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
+ }
+ else
+ {
+ // planar
+ rocfft_complex<float> actual_value{
+ reinterpret_cast<const float*>(output.front().data())[idx],
+ reinterpret_cast<const float*>(output.back().data())[idx]};
+ ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
+ ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
+ }
+}
+
+template <>
+void assert_init_value(const std::vector<hostbuf>& output,
+ const size_t idx,
+ const rocfft_complex<double> orig_value)
+{
+ // if this is interleaved, check directly
+ if(output.size() == 1)
+ {
+ rocfft_complex<double> actual_value
+ = reinterpret_cast<const rocfft_complex<double>*>(output.front().data())[idx];
+ ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
+ ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
+ }
+ else
+ {
+ // planar
+ rocfft_complex<double> actual_value{
+ reinterpret_cast<const double*>(output.front().data())[idx],
+ reinterpret_cast<const double*>(output.back().data())[idx]};
+ ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
+ ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
+ }
+}
+
+static const int OUTPUT_INIT_PATTERN = 0xcd;
+template <class Tfloat>
+void check_single_output_stride(const std::vector<hostbuf>& output,
+ const size_t offset,
+ const std::vector<size_t>& length,
+ const std::vector<size_t>& stride,
+ const size_t i)
+{
+ Tfloat orig;
+ memset(static_cast<void*>(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat));
+
+ size_t curLength = length[i];
+ size_t curStride = stride[i];
+ size_t nextSmallerLength = i == length.size() - 1 ? 0 : length[i + 1];
+ size_t nextSmallerStride = i == stride.size() - 1 ? 0 : stride[i + 1];
+
+ if(nextSmallerLength == 0)
+ {
+ // this is the fastest dim, indexes that are not multiples of
+ // the stride should be the initial value
+ for(size_t idx = 0; idx < (curLength - 1) * curStride; ++idx)
+ {
+ if(idx % curStride != 0)
+ assert_init_value<Tfloat>(output, idx, orig);
+ }
+ }
+ else
+ {
+ for(size_t lengthIdx = 0; lengthIdx < curLength; ++lengthIdx)
+ {
+ // check that the space after the next smaller dim and the
+ // end of this dim is initial value
+ for(size_t idx = nextSmallerLength * nextSmallerStride; idx < curStride; ++idx)
+ assert_init_value<Tfloat>(output, idx, orig);
+
+ check_single_output_stride<Tfloat>(
+ output, offset + lengthIdx * curStride, length, stride, i + 1);
+ }
+ }
+}
+
+template <class Tparams>
+void check_output_strides(const std::vector<hostbuf>& output, Tparams& params)
+{
+ // treat batch+dist like highest length+stride, if batch > 1
+ std::vector<size_t> length;
+ std::vector<size_t> stride;
+ if(params.nbatch > 1)
+ {
+ length.push_back(params.nbatch);
+ stride.push_back(params.odist);
+ }
+
+ auto olength = params.olength();
+ std::copy(olength.begin(), olength.end(), std::back_inserter(length));
+ std::copy(params.ostride.begin(), params.ostride.end(), std::back_inserter(stride));
+
+ if(params.precision == fft_precision_single)
+ {
+ if(params.otype == fft_array_type_real)
+ check_single_output_stride<float>(output, 0, length, stride, 0);
+ else
+ check_single_output_stride<rocfft_complex<float>>(output, 0, length, stride, 0);
+ }
+ else
+ {
+ if(params.otype == fft_array_type_real)
+ check_single_output_stride<double>(output, 0, length, stride, 0);
+ else
+ check_single_output_stride<rocfft_complex<double>>(output, 0, length, stride, 0);
+ }
+}
+
+// run rocFFT inverse transform
+template <class Tparams>
+inline void run_round_trip_inverse(Tparams& params,
+ std::vector<gpubuf>& obuffer,
+ std::vector<void*>& pibuffer,
+ std::vector<void*>& pobuffer,
+ std::vector<hostbuf>& gpu_output)
+{
+ params.validate();
+
+ // Make sure that the parameters make sense:
+ ASSERT_TRUE(params.valid(verbose));
+
+ // Create FFT plan - this will also allocate work buffer, but will throw a
+ // specific exception if that step fails
+ auto plan_status = fft_status_success;
+ try
+ {
+ plan_status = params.create_plan();
+ }
+ catch(fft_params::work_buffer_alloc_failure& e)
+ {
+ std::stringstream ss;
+ ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")";
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << ss.str();
+ }
+ else
+ {
+ GTEST_FAIL() << ss.str();
+ }
+ }
+ ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed";
+
+ auto obuffer_sizes = params.obuffer_sizes();
+
+ if(params.placement != fft_placement_inplace)
+ {
+ for(unsigned int i = 0; i < obuffer_sizes.size(); ++i)
+ {
+ // If we're validating output strides, init the
+ // output buffer to a known pattern and we can check
+ // that the pattern is untouched in places that
+ // shouldn't have been touched.
+ if(params.check_output_strides)
+ {
+ auto hip_status
+ = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << "hipMemset failure";
+ }
+ else
+ {
+ GTEST_FAIL() << "hipMemset failure";
+ }
+ }
+ }
+ }
+ }
+
+ // execute GPU transform
+ execute_gpu_fft(params, pibuffer, pobuffer, obuffer, gpu_output, true);
+}
+
+// compare rocFFT inverse transform with forward transform input
+template <class Tparams>
+inline void compare_round_trip_inverse(Tparams& params,
+ fft_params& contiguous_params,
+ std::vector<hostbuf>& gpu_output,
+ std::vector<hostbuf>& cpu_input,
+ const VectorNorms& cpu_input_norm,
+ size_t total_length)
+{
+ if(params.check_output_strides)
+ {
+ check_output_strides<Tparams>(gpu_output, params);
+ }
+
+ // compute GPU output norm
+ std::shared_future<VectorNorms> gpu_norm = std::async(std::launch::async, [&]() {
+ return norm(gpu_output,
+ params.olength(),
+ params.nbatch,
+ params.precision,
+ params.otype,
+ params.ostride,
+ params.odist,
+ params.ooffset);
+ });
+
+ // compare GPU inverse output to CPU forward input
+ std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
+ if(verbose > 1)
+ linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
+ const double linf_cutoff
+ = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length);
+
+ VectorNorms diff = distance(cpu_input,
+ gpu_output,
+ params.olength(),
+ params.nbatch,
+ params.precision,
+ contiguous_params.itype,
+ contiguous_params.istride,
+ contiguous_params.idist,
+ params.otype,
+ params.ostride,
+ params.odist,
+ linf_failures.get(),
+ linf_cutoff,
+ {0},
+ params.ooffset,
+ 1.0 / total_length);
+
+ if(verbose > 1)
+ {
+ std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n";
+ std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n";
+ std::cout << "GPU linf norm failures:";
+ std::sort(linf_failures->begin(), linf_failures->end());
+ for(const auto& i : *linf_failures)
+ {
+ std::cout << " (" << i.first << "," << i.second << ")";
+ }
+ std::cout << std::endl;
+ }
+
+ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str();
+ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str();
+
+ switch(params.precision)
+ {
+ case fft_precision_half:
+ max_linf_eps_half
+ = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
+ max_l2_eps_half
+ = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
+ break;
+ case fft_precision_single:
+ max_linf_eps_single
+ = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
+ max_l2_eps_single
+ = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
+ break;
+ case fft_precision_double:
+ max_linf_eps_double
+ = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
+ max_l2_eps_double
+ = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
+ break;
+ }
+
+ if(verbose > 1)
+ {
+ std::cout << "L2 diff: " << diff.l_2 << "\n";
+ std::cout << "Linf diff: " << diff.l_inf << "\n";
+ }
+
+ EXPECT_TRUE(diff.l_inf <= linf_cutoff)
+ << "Linf test failed. Linf:" << diff.l_inf
+ << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff
+ << params.str();
+
+ EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2
+ < sqrt(log2(total_length)) * type_epsilon(params.precision))
+ << "L2 test failed. L2: " << diff.l_2
+ << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2
+ << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
+ << params.str();
+}
+
+// RAII type to put data into the cache when this object leaves scope
+struct StoreCPUDataToCache
+{
+ StoreCPUDataToCache(std::vector<hostbuf>& cpu_input, std::vector<hostbuf>& cpu_output)
+ : cpu_input(cpu_input)
+ , cpu_output(cpu_output)
+ {
+ }
+ ~StoreCPUDataToCache()
+ {
+ last_cpu_fft_data.cpu_output.swap(cpu_output);
+ last_cpu_fft_data.cpu_input.swap(cpu_input);
+ }
+ std::vector<hostbuf>& cpu_input;
+ std::vector<hostbuf>& cpu_output;
+};
+
+// run CPU + rocFFT transform with the given params and compare
+template <class Tfloat, class Tparams>
+inline void fft_vs_reference_impl(Tparams& params, bool round_trip)
+{
+ // Call hipGetLastError to reset any errors
+ // returned by previous HIP runtime API calls.
+ hipError_t hip_status = hipGetLastError();
+
+ // Make sure that the parameters make sense:
+ ASSERT_TRUE(params.valid(verbose));
+
+ size_t needed_ram = needed_ram_buffers(params, verbose);
+
+ if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
+ {
+ GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb
+ << ".\n";
+ }
+
+ auto ibuffer_sizes = params.ibuffer_sizes();
+ auto obuffer_sizes = params.obuffer_sizes();
+
+ size_t vram_avail = 0;
+
+ if(vramgb == 0)
+ {
+ // Check free and total available memory:
+ size_t free = 0;
+ size_t total = 0;
+ auto hip_status = hipMemGetInfo(&free, &total);
+ if(hip_status != hipSuccess || total == 0)
+ {
+ ++n_hip_failures;
+ std::stringstream ss;
+ if(total == 0)
+ ss << "hipMemGetInfo claims there there isn't any vram";
+ else
+ ss << "hipMemGetInfo failure with error " << hip_status;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << ss.str();
+ }
+ else
+ {
+ GTEST_FAIL() << ss.str();
+ }
+ }
+ vram_avail = total;
+ }
+ else
+ {
+ vram_avail = vramgb * ONE_GiB;
+ }
+
+ // First try a quick estimation of vram footprint, to speed up skipping tests
+ // that are too large to fit in the gpu (no plan created with the rocFFT backend)
+ const auto raw_vram_footprint
+ = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
+
+ if(!vram_fits_problem(raw_vram_footprint, vram_avail))
+ {
+ GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint)
+ << " GiB) raw data too large for device";
+ }
+
+ if(verbose > 2)
+ {
+ std::cout << "Raw problem size: " << raw_vram_footprint << std::endl;
+ }
+
+ // If it passed the quick estimation test, go for the more
+ // accurate calculation that actually creates the plan and
+ // take into account the work buffer size
+ const auto vram_footprint = params.vram_footprint();
+ if(!vram_fits_problem(vram_footprint, vram_avail))
+ {
+ if(verbose)
+ {
+ std::cout << "Problem raw data won't fit on device; skipped." << std::endl;
+ }
+ GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint)
+ << " GiB) raw data too large for device";
+ }
+
+ // Create FFT plan - this will also allocate work buffer, but
+ // will throw a specific exception if that step fails
+ auto plan_status = fft_status_success;
+ try
+ {
+ plan_status = params.create_plan();
+ }
+ catch(fft_params::work_buffer_alloc_failure& e)
+ {
+ ++n_hip_failures;
+ std::stringstream ss;
+ ss << "Work buffer allocation failed with size: " << params.workbuffersize;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << ss.str();
+ }
+ else
+ {
+ GTEST_FAIL() << ss.str();
+ }
+ }
+ ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed";
+
+ if(!vram_fits_problem(vram_footprint, vram_avail))
+ {
+ if(verbose)
+ {
+ std::cout << "Problem won't fit on device; skipped." << std::endl;
+ }
+ GTEST_SKIP() << "Problem size (" << vram_footprint << ") too large for device";
+ return;
+ }
+
+ fft_params contiguous_params;
+ contiguous_params.length = params.length;
+ contiguous_params.precision = params.precision;
+ contiguous_params.placement = fft_placement_notinplace;
+ contiguous_params.transform_type = params.transform_type;
+ contiguous_params.nbatch = params.nbatch;
+ contiguous_params.itype = contiguous_itype(params.transform_type);
+ contiguous_params.otype = contiguous_otype(contiguous_params.transform_type);
+
+ contiguous_params.validate();
+
+ if(!contiguous_params.valid(verbose))
+ {
+ throw std::runtime_error("Invalid contiguous params");
+ }
+
+ if(verbose > 3)
+ {
+ std::cout << "CPU params:\n";
+ std::cout << contiguous_params.str("\n\t") << std::endl;
+ }
+
+ std::vector<gpubuf> ibuffer(ibuffer_sizes.size());
+ std::vector<void*> pibuffer(ibuffer_sizes.size());
+ for(unsigned int i = 0; i < ibuffer.size(); ++i)
+ {
+ hip_status = ibuffer[i].alloc(ibuffer_sizes[i]);
+ if(hip_status != hipSuccess)
+ {
+ std::stringstream ss;
+ ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "("
+ << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)"
+ << " with code " << hipError_to_string(hip_status);
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << ss.str();
+ }
+ else
+ {
+ GTEST_FAIL() << ss.str();
+ }
+ }
+ pibuffer[i] = ibuffer[i].data();
+ }
+
+ // allocation counts in elements, ibuffer_sizes is in bytes
+ auto ibuffer_sizes_elems = ibuffer_sizes;
+ for(auto& buf : ibuffer_sizes_elems)
+ buf /= var_size<size_t>(params.precision, params.itype);
+
+ // Check cache first - nbatch is a >= comparison because we compute
+ // the largest batch size and cache it. Smaller batch runs can
+ // compare against the larger data.
+ std::vector<hostbuf> cpu_input;
+ std::vector<hostbuf> cpu_output;
+ std::shared_future<void> convert_cpu_output_precision;
+ std::shared_future<void> convert_cpu_input_precision;
+ bool run_fftw = true;
+ std::unique_ptr<StoreCPUDataToCache> store_to_cache;
+ if(fftw_compare && last_cpu_fft_data.length == params.length
+ && last_cpu_fft_data.transform_type == params.transform_type
+ && last_cpu_fft_data.run_callbacks == params.run_callbacks)
+ {
+ if(last_cpu_fft_data.nbatch >= params.nbatch)
+ {
+ // use the cached input/output
+ cpu_input.swap(last_cpu_fft_data.cpu_input);
+ cpu_output.swap(last_cpu_fft_data.cpu_output);
+ run_fftw = false;
+
+ store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
+
+ if(params.precision != last_cpu_fft_data.precision)
+ {
+ // Tests should be ordered so we do wider first, then narrower.
+ switch(params.precision)
+ {
+ case fft_precision_double:
+ std::cerr
+ << "test ordering is incorrect: double precision follows a narrower one"
+ << std::endl;
+ abort();
+ break;
+ case fft_precision_single:
+ if(last_cpu_fft_data.precision != fft_precision_double)
+ {
+ std::cerr
+ << "test ordering is incorrect: float precision follows a narrower one"
+ << std::endl;
+ abort();
+ }
+ // convert the input/output to single-precision
+ convert_cpu_output_precision = std::async(std::launch::async, [&]() {
+ narrow_precision_inplace<double, float>(cpu_output.front());
+ });
+ convert_cpu_input_precision = std::async(std::launch::async, [&]() {
+ narrow_precision_inplace<double, float>(cpu_input.front());
+ });
+ break;
+ case fft_precision_half:
+ // convert to half precision
+ if(last_cpu_fft_data.precision == fft_precision_double)
+ {
+ convert_cpu_output_precision = std::async(std::launch::async, [&]() {
+ narrow_precision_inplace<double, _Float16>(cpu_output.front());
+ });
+ convert_cpu_input_precision = std::async(std::launch::async, [&]() {
+ narrow_precision_inplace<double, _Float16>(cpu_input.front());
+ });
+ }
+ else if(last_cpu_fft_data.precision == fft_precision_single)
+ {
+ convert_cpu_output_precision = std::async(std::launch::async, [&]() {
+ narrow_precision_inplace<float, _Float16>(cpu_output.front());
+ });
+ convert_cpu_input_precision = std::async(std::launch::async, [&]() {
+ narrow_precision_inplace<float, _Float16>(cpu_input.front());
+ });
+ }
+ else
+ {
+ std::cerr << "unhandled previous precision, cannot convert to half"
+ << std::endl;
+ abort();
+ }
+ break;
+ }
+ last_cpu_fft_data.precision = params.precision;
+ }
+ }
+ // If the last result has a smaller batch than the new
+ // params, that might be a developer error - tests should be
+ // ordered to generate the bigger batch first. But if tests
+ // got filtered or skipped due to insufficient memory, we
+ // might never have tried to generate the bigger batch first.
+ // So just fall through and redo the CPU FFT.
+ }
+ else
+ {
+ // Clear cache explicitly so that even if we didn't get a hit,
+ // we're not uselessly holding on to cached cpu input/output
+ last_cpu_fft_data = last_cpu_fft_cache();
+ }
+
+ // Allocate CPU input
+ if(run_fftw)
+ {
+ cpu_input = allocate_cpu_fft_buffer(
+ contiguous_params.precision, contiguous_params.itype, contiguous_params.isize);
+ }
+
+ // Create FFTW plan - this may write to input, but that's fine
+ // since there's nothing in there right now
+ typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan = nullptr;
+ if(run_fftw)
+ {
+ // Normally, we would want to defer allocation of CPU output
+ // buffer until when we actually do the CPU FFT. But if we're
+ // using FFTW wisdom, FFTW needs an output buffer at plan
+ // creation time.
+ if(use_fftw_wisdom)
+ {
+ cpu_output = allocate_cpu_fft_buffer(
+ contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
+ }
+ cpu_plan = fftw_plan_via_rocfft<Tfloat>(contiguous_params.length,
+ contiguous_params.istride,
+ contiguous_params.ostride,
+ contiguous_params.nbatch,
+ contiguous_params.idist,
+ contiguous_params.odist,
+ contiguous_params.transform_type,
+ cpu_input,
+ cpu_output);
+
+ needed_ram += needed_ram_fftw<Tfloat>(contiguous_params, cpu_plan, verbose);
+
+ if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
+ {
+ if(verbose)
+ {
+ std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]."
+ << std::endl;
+ }
+ GTEST_SKIP();
+ return;
+ }
+ }
+
+ std::vector<hostbuf> gpu_input_data;
+
+ // allocate and populate the input buffer (cpu/gpu)
+ if(run_fftw)
+ {
+ gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems);
+
+ //generate the input directly on the gpu
+ params.compute_input(ibuffer);
+
+ // Copy the input to CPU
+ if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
+ || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
+ {
+ // Copy input to CPU
+ for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
+ {
+ hip_status = hipMemcpy(gpu_input_data.at(idx).data(),
+ ibuffer[idx].data(),
+ ibuffer_sizes[idx],
+ hipMemcpyDeviceToHost);
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
+ }
+ else
+ {
+ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
+ }
+ }
+ }
+
+ copy_buffers(gpu_input_data,
+ cpu_input,
+ params.ilength(),
+ params.nbatch,
+ params.precision,
+ params.itype,
+ params.istride,
+ params.idist,
+ contiguous_params.itype,
+ contiguous_params.istride,
+ contiguous_params.idist,
+ params.ioffset,
+ contiguous_params.ioffset);
+ }
+ else
+ {
+ // Copy input to CPU
+ for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
+ {
+ hip_status = hipMemcpy(cpu_input.at(idx).data(),
+ ibuffer[idx].data(),
+ ibuffer_sizes[idx],
+ hipMemcpyDeviceToHost);
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
+ }
+ else
+ {
+ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
+ }
+ }
+ }
+ }
+ }
+ else if(fftw_compare)
+ {
+ gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems);
+
+ // In case the cached cpu input needed conversion, wait for it
+ if(convert_cpu_input_precision.valid())
+ convert_cpu_input_precision.get();
+
+        // reuse the cached CPU input as the source for the GPU input buffer
+ std::vector<hostbuf>* gpu_input = &cpu_input;
+
+ if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
+ || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
+ {
+ copy_buffers(cpu_input,
+ gpu_input_data,
+ params.ilength(),
+ params.nbatch,
+ params.precision,
+ contiguous_params.itype,
+ contiguous_params.istride,
+ contiguous_params.idist,
+ params.itype,
+ params.istride,
+ params.idist,
+ {0},
+ params.ioffset);
+ gpu_input = &gpu_input_data;
+ }
+
+ // Copy input to GPU
+ for(unsigned int idx = 0; idx < gpu_input->size(); ++idx)
+ {
+ hip_status = hipMemcpy(ibuffer[idx].data(),
+ gpu_input->at(idx).data(),
+ ibuffer_sizes[idx],
+ hipMemcpyHostToDevice);
+
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
+ }
+ else
+ {
+ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
+ }
+ }
+ }
+ }
+
+ if(verbose > 3)
+ {
+ std::cout << "CPU input:\n";
+ contiguous_params.print_ibuffer(cpu_input);
+ }
+
+ // compute input norm
+ std::shared_future<VectorNorms> cpu_input_norm;
+ if(fftw_compare)
+ cpu_input_norm = std::async(std::launch::async, [&]() {
+ // in case the cached cpu input needed conversion, wait for it
+ if(convert_cpu_input_precision.valid())
+ convert_cpu_input_precision.get();
+
+ auto input_norm = norm(cpu_input,
+ contiguous_params.ilength(),
+ contiguous_params.nbatch,
+ contiguous_params.precision,
+ contiguous_params.itype,
+ contiguous_params.istride,
+ contiguous_params.idist,
+ contiguous_params.ioffset);
+ if(verbose > 2)
+ {
+ std::cout << "CPU Input Linf norm: " << input_norm.l_inf << "\n";
+ std::cout << "CPU Input L2 norm: " << input_norm.l_2 << "\n";
+ }
+ return input_norm;
+ });
+
+ std::vector<gpubuf> obuffer_data;
+ std::vector<gpubuf>* obuffer = &obuffer_data;
+ std::vector<void*> pobuffer;
+
+ // allocate the output buffer
+
+ if(params.placement == fft_placement_inplace)
+ {
+ obuffer = &ibuffer;
+ }
+ else
+ {
+ auto obuffer_sizes = params.obuffer_sizes();
+ obuffer_data.resize(obuffer_sizes.size());
+ for(unsigned int i = 0; i < obuffer_data.size(); ++i)
+ {
+ hip_status = obuffer_data[i].alloc(obuffer_sizes[i]);
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ std::stringstream ss;
+ ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i]
+               << " (" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)"
+ << " with code " << hipError_to_string(hip_status);
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << ss.str();
+ }
+ else
+ {
+ GTEST_FAIL() << ss.str();
+ }
+ }
+
+ // If we're validating output strides, init the
+ // output buffer to a known pattern and we can check
+ // that the pattern is untouched in places that
+ // shouldn't have been touched.
+ if(params.check_output_strides)
+ {
+ hip_status
+ = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
+ if(hip_status != hipSuccess)
+ {
+ ++n_hip_failures;
+ if(skip_runtime_fails)
+ {
+ GTEST_SKIP() << "hipMemset failure with error " << hip_status;
+ }
+ else
+ {
+ GTEST_FAIL() << "hipMemset failure with error " << hip_status;
+ }
+ }
+ }
+ }
+ }
+ pobuffer.resize(obuffer->size());
+ for(unsigned int i = 0; i < obuffer->size(); ++i)
+ {
+ pobuffer[i] = obuffer->at(i).data();
+ }
+
+ // Run CPU transform
+ //
+ // NOTE: This must happen after input is copied to GPU and input
+ // norm is computed, since the CPU FFT may overwrite the input.
+ VectorNorms cpu_output_norm;
+ std::shared_future<void> cpu_fft;
+ if(fftw_compare)
+ cpu_fft = std::async(std::launch::async, [&]() {
+ // wait for input norm to finish, since we might overwrite input
+ cpu_input_norm.get();
+
+ if(run_fftw)
+ execute_cpu_fft<Tfloat>(params, contiguous_params, cpu_plan, cpu_input, cpu_output);
+ // in case the cached cpu output needed conversion, wait for it
+ else if(convert_cpu_output_precision.valid())
+ convert_cpu_output_precision.get();
+
+ if(verbose > 3)
+ {
+ std::cout << "CPU output:\n";
+ contiguous_params.print_obuffer(cpu_output);
+ }
+
+ cpu_output_norm = norm(cpu_output,
+ params.olength(),
+ params.nbatch,
+ params.precision,
+ contiguous_params.otype,
+ contiguous_params.ostride,
+ contiguous_params.odist,
+ contiguous_params.ooffset);
+ if(verbose > 2)
+ {
+ std::cout << "CPU Output Linf norm: " << cpu_output_norm.l_inf << "\n";
+ std::cout << "CPU Output L2 norm: " << cpu_output_norm.l_2 << "\n";
+ }
+ });
+
+ // scatter data out to multi-GPUs if this is a multi-GPU test
+ params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer);
+
+ // execute GPU transform
+ std::vector<hostbuf> gpu_output
+ = allocate_host_buffer(params.precision, params.otype, params.osize);
+
+ execute_gpu_fft(params, pibuffer, pobuffer, *obuffer, gpu_output);
+
+ params.free();
+
+ if(params.check_output_strides)
+ {
+ check_output_strides<Tparams>(gpu_output, params);
+ }
+
+ // compute GPU output norm
+ std::shared_future<VectorNorms> gpu_norm;
+ if(fftw_compare)
+ gpu_norm = std::async(std::launch::async, [&]() {
+ return norm(gpu_output,
+ params.olength(),
+ params.nbatch,
+ params.precision,
+ params.otype,
+ params.ostride,
+ params.odist,
+ params.ooffset);
+ });
+
+ // compare output
+ //
+ // Compute the l-infinity and l-2 distance between the CPU and GPU output:
+ // wait for cpu FFT so we can compute cutoff
+
+ const auto total_length = std::accumulate(params.length.begin(),
+ params.length.end(),
+ static_cast<size_t>(1),
+ std::multiplies<size_t>());
+
+ std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
+ if(verbose > 1)
+ linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
+ double linf_cutoff;
+ VectorNorms diff;
+
+ std::shared_future<void> compare_output;
+ if(fftw_compare)
+ compare_output = std::async(std::launch::async, [&]() {
+ cpu_fft.get();
+ linf_cutoff
+ = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length);
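+            // (Illustrative numbers, assuming type_epsilon(single) is
+            // FLT_EPSILON ~= 1.19e-7: with total_length = 1024 and a CPU
+            // output Linf norm of 1.0, this cutoff is roughly
+            // 1.19e-7 * 1.0 * log(1024) ~= 8.3e-7.)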
+
+ diff = distance(cpu_output,
+ gpu_output,
+ params.olength(),
+ params.nbatch,
+ params.precision,
+ contiguous_params.otype,
+ contiguous_params.ostride,
+ contiguous_params.odist,
+ params.otype,
+ params.ostride,
+ params.odist,
+ linf_failures.get(),
+ linf_cutoff,
+ {0},
+ params.ooffset);
+ });
+
+ // Update the cache if this current transform is different from
+ // what's stored. But if this transform only has a smaller batch
+ // than what's cached, we can still keep the cache around since
+ // the input/output we already have is still valid.
+ const bool update_last_cpu_fft_data
+ = last_cpu_fft_data.length != params.length
+ || last_cpu_fft_data.transform_type != params.transform_type
+ || last_cpu_fft_data.run_callbacks != params.run_callbacks
+ || last_cpu_fft_data.precision != params.precision
+ || params.nbatch > last_cpu_fft_data.nbatch;
+
+ // store cpu output in cache
+ if(update_last_cpu_fft_data)
+ {
+ last_cpu_fft_data.length = params.length;
+ last_cpu_fft_data.nbatch = params.nbatch;
+ last_cpu_fft_data.transform_type = params.transform_type;
+ last_cpu_fft_data.run_callbacks = params.run_callbacks;
+ last_cpu_fft_data.precision = params.precision;
+ }
+
+ if(compare_output.valid())
+ compare_output.get();
+
+ if(!store_to_cache)
+ store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
+
+ Tparams params_inverse;
+
+ if(round_trip)
+ {
+ params_inverse.inverse_from_forward(params);
+
+ run_round_trip_inverse<Tparams>(
+ params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data);
+ }
+
+ if(fftw_compare)
+ {
+ ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2));
+ ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf));
+
+ ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2));
+ ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf));
+
+ if(verbose > 1)
+ {
+ std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n";
+ std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n";
+ std::cout << "GPU linf norm failures:";
+ std::sort(linf_failures->begin(), linf_failures->end());
+ for(const auto& i : *linf_failures)
+ {
+ std::cout << " (" << i.first << "," << i.second << ")";
+ }
+ std::cout << std::endl;
+ }
+
+ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str();
+ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str();
+ }
+
+ switch(params.precision)
+ {
+ case fft_precision_half:
+ max_linf_eps_half
+ = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
+ max_l2_eps_half
+ = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
+ break;
+ case fft_precision_single:
+ max_linf_eps_single
+ = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
+ max_l2_eps_single = std::max(max_l2_eps_single,
+ diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
+ break;
+ case fft_precision_double:
+ max_linf_eps_double
+ = std::max(max_linf_eps_double, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
+ max_l2_eps_double = std::max(max_l2_eps_double,
+ diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
+ break;
+ }
+
+ if(verbose > 1)
+ {
+ std::cout << "L2 diff: " << diff.l_2 << "\n";
+ std::cout << "Linf diff: " << diff.l_inf << "\n";
+ }
+
+ if(fftw_compare)
+ {
+ EXPECT_TRUE(diff.l_inf <= linf_cutoff)
+ << "Linf test failed. Linf:" << diff.l_inf
+ << "\tnormalized Linf: " << diff.l_inf / cpu_output_norm.l_inf
+ << "\tcutoff: " << linf_cutoff << params.str();
+
+ EXPECT_TRUE(diff.l_2 / cpu_output_norm.l_2
+ < sqrt(log2(total_length)) * type_epsilon(params.precision))
+ << "L2 test failed. L2: " << diff.l_2
+ << "\tnormalized L2: " << diff.l_2 / cpu_output_norm.l_2
+ << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
+ << params.str();
+ }
+
+ if(round_trip && fftw_compare)
+ {
+ compare_round_trip_inverse<Tparams>(params_inverse,
+ contiguous_params,
+ gpu_input_data,
+ cpu_input,
+ cpu_input_norm.get(),
+ total_length);
+ }
+}
+
+#endif
diff --git a/shared/arithmetic.h b/shared/arithmetic.h
new file mode 100644
index 0000000..774d342
--- /dev/null
+++ b/shared/arithmetic.h
@@ -0,0 +1,61 @@
+/******************************************************************************
+* Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#pragma once
+
+#include <numeric>
+#include <stddef.h>
+
+// arithmetic helper functions
+
+static inline bool IsPo2(size_t u)
+{
+ return (u != 0) && (0 == (u & (u - 1)));
+}
+
+// helper function: find the smallest power of 2 that is >= n and return its
+// exponent, e.g. CeilPo2(7) returns 3 since 2^3 >= 7
+static inline size_t CeilPo2(size_t n)
+{
+ size_t v = 1, t = 0;
+ while(v < n)
+ {
+ v <<= 1;
+ t++;
+ }
+
+ return t;
+}
+
+template <typename T>
+static inline T DivRoundingUp(T a, T b)
+{
+ return (a + (b - 1)) / b;
+}
+
+template <typename Titer>
+typename Titer::value_type product(Titer begin, Titer end)
+{
+ return std::accumulate(
+ begin, end, typename Titer::value_type(1), std::multiplies<typename Titer::value_type>());
+}
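+
+// Illustrative example (not part of this header's interface): for a length
+// vector {4, 8, 16}, product(length.begin(), length.end()) == 512,
+// IsPo2(512) is true, and CeilPo2(512) == 9 since 2^9 == 512.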
diff --git a/shared/array_predicate.h b/shared/array_predicate.h
new file mode 100644
index 0000000..92e45b4
--- /dev/null
+++ b/shared/array_predicate.h
@@ -0,0 +1,47 @@
+// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_ARRAY_PREDICATE_H
+#define ROCFFT_ARRAY_PREDICATE_H
+
+#include "rocfft/rocfft.h"
+
+namespace
+{
+ bool array_type_is_complex(rocfft_array_type type)
+ {
+ return type == rocfft_array_type_complex_interleaved
+ || type == rocfft_array_type_complex_planar
+ || type == rocfft_array_type_hermitian_interleaved
+ || type == rocfft_array_type_hermitian_planar;
+ }
+ bool array_type_is_interleaved(rocfft_array_type type)
+ {
+ return type == rocfft_array_type_complex_interleaved
+ || type == rocfft_array_type_hermitian_interleaved;
+ }
+ bool array_type_is_planar(rocfft_array_type type)
+ {
+ return type == rocfft_array_type_complex_planar
+ || type == rocfft_array_type_hermitian_planar;
+ }
+}
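+
+// Illustrative example (values taken from the rocfft_array_type enum):
+// array_type_is_complex(rocfft_array_type_real) is false, while
+// array_type_is_interleaved(rocfft_array_type_complex_interleaved) and
+// array_type_is_planar(rocfft_array_type_hermitian_planar) are both true.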
+
+#endif
diff --git a/shared/array_validator.cpp b/shared/array_validator.cpp
new file mode 100644
index 0000000..70abb08
--- /dev/null
+++ b/shared/array_validator.cpp
@@ -0,0 +1,549 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <iostream>
+#include <numeric>
+#include <unordered_set>
+
+#include "array_validator.h"
+#include "increment.h"
+
+// Check a 2D array for collisions.
+// The 2D case can be determined via a number-theoretic argument.
+bool valid_length_stride_2d(const size_t l0, const size_t l1, const size_t s0, const size_t s1)
+{
+ if(s0 == s1)
+ return false;
+ const auto c = std::lcm(s0, s1);
+ return !((s0 * (l0 - 1) >= c) && (s1 * (l1 - 1) >= c));
+}
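+
+// Worked example of the argument above (illustrative, not library code): with
+// strides s0 = 2 and s1 = 3, two offsets can only coincide at a multiple of
+// lcm(2, 3) = 6, so a collision requires both s0 * (l0 - 1) >= 6 and
+// s1 * (l1 - 1) >= 6, i.e. l0 >= 4 and l1 >= 3; smaller lengths are
+// guaranteed collision-free, which is exactly the condition tested above.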
+
+// Compare a 1D direction with a multi-index hyperface for collisions.
+bool valid_length_stride_1d_multi(const unsigned int idx,
+ const std::vector<size_t> l,
+ const std::vector<size_t> s,
+ const int verbose)
+{
+ size_t l0{0}, s0{0};
+ std::vector<size_t> l1{}, s1{};
+ for(unsigned int i = 0; i < l.size(); ++i)
+ {
+ if(i == idx)
+ {
+ l0 = l[i];
+ s0 = s[i];
+ }
+ else
+ {
+ l1.push_back(l[i]);
+ s1.push_back(s[i]);
+ }
+ }
+
+ if(verbose > 4)
+ {
+ std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl;
+ }
+
+ // We only need to go to the maximum pointer offset for (l1,s1).
+ const auto max_offset
+ = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies<size_t>())
+          - std::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0);
+ std::unordered_set<size_t> a0{};
+ for(size_t i = 1; i < l0; ++i)
+ {
+ const auto val = i * s0;
+ if(val <= max_offset)
+ a0.insert(val);
+ else
+ break;
+ }
+
+ if(verbose > 5)
+ {
+ std::cout << "a0:";
+ for(auto i : a0)
+ std::cout << " " << i;
+ std::cout << std::endl;
+
+ std::cout << "l1:";
+ for(auto i : l1)
+ std::cout << " " << i;
+ std::cout << std::endl;
+
+ std::cout << "s1:";
+ for(auto i : s1)
+ std::cout << " " << i;
+ std::cout << std::endl;
+ }
+
+ // TODO: this can be multi-threaded, since find(...) is thread-safe.
+ std::vector<size_t> index(l1.size());
+ std::fill(index.begin(), index.end(), 0);
+ do
+ {
+ const int i = std::inner_product(index.begin(), index.end(), s1.begin(), (size_t)0);
+ if(i > 0 && (i % s0 == 0))
+ {
+ // TODO: use an ordered set and binary search
+ if(verbose > 6)
+ std::cout << i << std::endl;
+ if(a0.find(i) != a0.end())
+ {
+ if(verbose > 4)
+ {
+ std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl;
+ std::cout << "l1:";
+ for(const auto li : l1)
+ std::cout << " " << li;
+ std::cout << " s1:";
+ for(const auto si : s1)
+ std::cout << " " << si;
+ std::cout << std::endl;
+ std::cout << "Found duplicate: " << i << std::endl;
+ }
+ return false;
+ }
+ }
+ } while(increment_rowmajor(index, l1));
+
+ return true;
+}
+
+// Compare a hyperface with another hyperface for collisions.
+bool valid_length_stride_multi_multi(const std::vector<size_t> l0,
+ const std::vector<size_t> s0,
+ const std::vector<size_t> l1,
+ const std::vector<size_t> s1)
+{
+ std::unordered_set<size_t> a0{};
+
+ const auto max_offset
+ = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies<size_t>())
+ - std::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0);
+ std::vector<size_t> index0(l0.size()); // TODO: check this
+ std::fill(index0.begin(), index0.end(), 0);
+ do
+ {
+ const auto i = std::inner_product(index0.begin(), index0.end(), s0.begin(), (size_t)0);
+ if(i > max_offset)
+ a0.insert(i);
+ } while(increment_rowmajor(index0, l0));
+
+ std::vector<size_t> index1(l1.size());
+ std::fill(index1.begin(), index1.end(), 0);
+ do
+ {
+ const auto i = std::inner_product(index1.begin(), index1.end(), s1.begin(), (size_t)0);
+ if(i > 0)
+ {
+ // TODO: use an ordered set and binary search
+ if(a0.find(i) != a0.end())
+ {
+
+ return false;
+ }
+ }
+ } while(increment_rowmajor(index1, l1));
+
+ return true;
+}
+
+bool valid_length_stride_3d(const std::vector<size_t>& l,
+ const std::vector<size_t>& s,
+ const int verbose)
+{
+ // Check that 2D faces are valid:
+ if(!valid_length_stride_2d(l[0], l[1], s[0], s[1]))
+ return false;
+ if(!valid_length_stride_2d(l[0], l[2], s[0], s[2]))
+ return false;
+ if(!valid_length_stride_2d(l[1], l[2], s[1], s[2]))
+ return false;
+
+ // If the 2D faces are valid, check an axis vs a face for collisions:
+ bool invalid = false;
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+ for(int idx = 0; idx < 3; ++idx)
+ {
+ if(!valid_length_stride_1d_multi(idx, l, s, verbose))
+ {
+#ifdef _OPENMP
+#pragma omp cancel for
+#endif
+ invalid = true;
+ }
+ }
+ if(invalid)
+ return false;
+ return true;
+}
+
+bool valid_length_stride_4d(const std::vector<size_t>& l,
+ const std::vector<size_t>& s,
+ const int verbose)
+{
+ if(l.size() != 4)
+ {
+ throw std::runtime_error("Incorrect dimensions for valid_length_stride_4d");
+ }
+
+ // Check that 2D faces are valid:
+ for(int idx0 = 0; idx0 < 3; ++idx0)
+ {
+ for(int idx1 = idx0 + 1; idx1 < 4; ++idx1)
+ {
+ if(!valid_length_stride_2d(l[idx0], l[idx1], s[idx0], s[idx1]))
+ return false;
+ }
+ }
+
+ bool invalid = false;
+ // Check that 1D vs 3D faces are valid:
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+ for(int idx0 = 0; idx0 < 4; ++idx0)
+ {
+ if(!valid_length_stride_1d_multi(idx0, l, s, verbose))
+ {
+#ifdef _OPENMP
+#pragma omp cancel for
+#endif
+ invalid = true;
+ }
+ }
+ if(invalid)
+ return false;
+
+ // Check that 2D vs 2D faces are valid:
+
+ // First, get all the permutations
+ std::vector<std::vector<size_t>> perms;
+ std::vector<size_t> v(l.size());
+ std::fill(v.begin(), v.begin() + 2, 0);
+ std::fill(v.begin() + 2, v.end(), 1);
+ do
+ {
+ perms.push_back(v);
+ if(verbose > 3)
+ {
+ std::cout << "v:";
+ for(const auto i : v)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ }
+ } while(std::next_permutation(v.begin(), v.end()));
+
+ // Then loop over all of the permutations.
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+ for(size_t iperm = 0; iperm < perms.size(); ++iperm)
+ {
+        // start the groups empty; they are filled by push_back below
+        std::vector<size_t> l0{};
+        std::vector<size_t> s0{};
+        std::vector<size_t> l1{};
+        std::vector<size_t> s1{};
+ for(size_t i = 0; i < l.size(); ++i)
+ {
+ if(perms[iperm][i] == 0)
+ {
+ l0.push_back(l[i]);
+ s0.push_back(s[i]);
+ }
+ else
+ {
+ l1.push_back(l[i]);
+ s1.push_back(s[i]);
+ }
+ }
+
+ if(verbose > 3)
+ {
+ std::cout << "\tl0:";
+ for(const auto i : l0)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ std::cout << "\ts0:";
+ for(const auto i : s0)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ std::cout << "\tl1:";
+ for(const auto i : l1)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ std::cout << "\ts1:";
+ for(const auto i : s1)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ }
+
+ if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
+ {
+#ifdef _OPENMP
+#pragma omp cancel for
+#endif
+ invalid = true;
+ }
+ }
+ if(invalid)
+ return false;
+
+ return true;
+}
+
+bool valid_length_stride_generald(const std::vector<size_t> l,
+ const std::vector<size_t> s,
+ const int verbose)
+{
+ if(verbose > 2)
+ {
+ std::cout << "checking dimension " << l.size() << std::endl;
+ }
+
+ // Recurse on d-1 hyper-faces:
+ for(unsigned int idx = 0; idx < l.size(); ++idx)
+ {
+ std::vector<size_t> l0{};
+ std::vector<size_t> s0{};
+ for(size_t i = 0; i < l.size(); ++i)
+ {
+ if(i != idx)
+ {
+ l0.push_back(l[i]);
+ s0.push_back(s[i]);
+ }
+ }
+ if(!array_valid(l0, s0, verbose))
+ return false;
+ }
+
+ // Handle the 1D vs (N-1) case:
+ for(unsigned int idx = 0; idx < l.size(); ++idx)
+ {
+ if(!valid_length_stride_1d_multi(idx, l, s, verbose))
+ return false;
+ }
+
+ for(size_t dim0 = 2; dim0 <= l.size() / 2; ++dim0)
+ {
+ const size_t dim1 = l.size() - dim0;
+ if(verbose > 2)
+ std::cout << "dims: " << dim0 << " " << dim1 << std::endl;
+
+        // We iterate over all zero/one masks of length l.size() containing dim1
+        // zeros followed by dim0 ones. We start with the sorted mask
+        // {0, ..., 0, 1, ..., 1} to guarantee that std::next_permutation visits
+        // every possibility.
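+        // For example, with l.size() = 4 and dim0 = dim1 = 2,
+        // std::next_permutation visits the 6 masks {0,0,1,1}, {0,1,0,1},
+        // {0,1,1,0}, {1,0,0,1}, {1,0,1,0} and {1,1,0,0}, one for each way of
+        // assigning the four axes to the two groups.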
+
+ // First, get all the permutations
+ std::vector<std::vector<size_t>> perms;
+ std::vector<size_t> v(l.size());
+ std::fill(v.begin(), v.begin() + dim1, 0);
+ std::fill(v.begin() + dim1, v.end(), 1);
+ do
+ {
+ perms.push_back(v);
+ if(verbose > 3)
+ {
+ std::cout << "v:";
+ for(const auto i : v)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ }
+
+ } while(std::next_permutation(v.begin(), v.end()));
+
+ bool invalid = false;
+ // Then loop over all of the permutations.
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+ for(size_t iperm = 0; iperm < perms.size(); ++iperm)
+ {
+            // start the groups empty; they are filled by push_back below
+            std::vector<size_t> l0{};
+            std::vector<size_t> s0{};
+            std::vector<size_t> l1{};
+            std::vector<size_t> s1{};
+
+ for(size_t i = 0; i < l.size(); ++i)
+ {
+                // use this iteration's permutation rather than the scratch vector v
+                if(perms[iperm][i] == 0)
+ {
+ l0.push_back(l[i]);
+ s0.push_back(s[i]);
+ }
+ else
+ {
+ l1.push_back(l[i]);
+ s1.push_back(s[i]);
+ }
+ }
+
+ if(verbose > 3)
+ {
+ std::cout << "\tl0:";
+ for(const auto i : l0)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ std::cout << "\ts0:";
+ for(const auto i : s0)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ std::cout << "\tl1:";
+ for(const auto i : l1)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ std::cout << "\ts1:";
+ for(const auto i : s1)
+ {
+ std::cout << " " << i;
+ }
+ std::cout << "\n";
+ }
+
+ if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
+ {
+#ifdef _OPENMP
+#pragma omp cancel for
+#endif
+ invalid = true;
+ }
+ }
+ if(invalid)
+ return false;
+ }
+
+ return true;
+}
+
+bool sort_by_stride(const std::pair<size_t, size_t>& ls0, const std::pair<size_t, size_t>& ls1)
+{
+ return ls0.second < ls1.second;
+}
+
+bool array_valid(const std::vector<size_t>& length,
+ const std::vector<size_t>& stride,
+ const int verbose)
+{
+ if(length.size() != stride.size())
+ return false;
+
+ // If a length is 1, then the stride is irrelevant.
+    // If a length is > 1, then the corresponding stride must be nonzero.
+ std::vector<size_t> l{}, s{};
+ for(unsigned int i = 0; i < length.size(); ++i)
+ {
+ if(length[i] > 1)
+ {
+ if(stride[i] == 0)
+ return false;
+ l.push_back(length[i]);
+ s.push_back(stride[i]);
+ }
+ }
+
+ if(length.size() > 1)
+ {
+ // Check happy path.
+ bool happy_path = true;
+ std::vector<std::pair<size_t, size_t>> ls;
+ for(size_t idx = 0; idx < length.size(); ++idx)
+ {
+ ls.push_back(std::pair(length[idx], stride[idx]));
+ }
+ std::sort(ls.begin(), ls.end(), sort_by_stride);
+
+ if(verbose > 2)
+ {
+ for(size_t idx = 0; idx < ls.size(); ++idx)
+ {
+ std::cout << ls[idx].first << "\t" << ls[idx].second << "\n";
+ }
+ }
+
+ for(size_t idx = 1; idx < ls.size(); ++idx)
+ {
+ if(ls[idx].second < ls[idx - 1].first * ls[idx - 1].second)
+ {
+ happy_path = false;
+ break;
+ }
+ }
+ if(happy_path)
+ {
+ if(verbose > 2)
+ {
+ std::cout << "happy path\n";
+ }
+ return true;
+ }
+ }
+
+ switch(l.size())
+ {
+ case 0:
+ return true;
+ break;
+ case 1:
+ return s[0] != 0;
+ break;
+ case 2:
+ {
+ return valid_length_stride_2d(l[0], l[1], s[0], s[1]);
+ break;
+ }
+ case 3:
+ {
+ return valid_length_stride_3d(l, s, verbose);
+ break;
+ }
+ case 4:
+ {
+ return valid_length_stride_4d(l, s, verbose);
+ break;
+ }
+    default:
+        return valid_length_stride_generald(l, s, verbose);
+ }
+
+ return true;
+}
diff --git a/shared/array_validator.h b/shared/array_validator.h
new file mode 100644
index 0000000..ce85173
--- /dev/null
+++ b/shared/array_validator.h
@@ -0,0 +1,31 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ARRAY_VALIDATOR_H
+#define ARRAY_VALIDATOR_H
+
+#include <vector>
+
+// Checks whether the array with given length and stride has multi-index collisions.
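+// Illustrative example: array_valid({4, 4}, {1, 4}) is true (a contiguous
+// layout), while array_valid({4, 4}, {1, 2}) is false, since e.g. indices
+// (2, 0) and (0, 1) would both map to offset 2.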
+bool array_valid(const std::vector<size_t>& length,
+ const std::vector<size_t>& stride,
+ const int verbose = 0);
+
+#endif
diff --git a/shared/concurrency.h b/shared/concurrency.h
new file mode 100644
index 0000000..a36c7c1
--- /dev/null
+++ b/shared/concurrency.h
@@ -0,0 +1,41 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#include <thread>
+
+#ifndef WIN32
+#include <sched.h>
+#endif
+
+// work out how many parallel tasks to run, based on available
+// resources. on Linux, this will look at the cpu affinity mask (if
+// available) which might be restricted in a container. otherwise,
+// return std::thread::hardware_concurrency().
+static unsigned int rocfft_concurrency()
+{
+#ifndef WIN32
+ cpu_set_t cpuset;
+ if(sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0)
+ return CPU_COUNT(&cpuset);
+#endif
+ return std::thread::hardware_concurrency();
+}
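+
+// Minimal usage sketch (illustrative only, not part of this header):
+//
+//   std::vector<std::thread> workers;
+//   workers.reserve(rocfft_concurrency());
+//   // ... launch one worker per available hardware thread ...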
diff --git a/shared/data_gen_device.h b/shared/data_gen_device.h
new file mode 100644
index 0000000..77fb012
--- /dev/null
+++ b/shared/data_gen_device.h
@@ -0,0 +1,1303 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef DATA_GEN_DEVICE_H
+#define DATA_GEN_DEVICE_H
+
+// rocRAND can generate warnings if inline asm is not available for
+// some architectures. data generation isn't performance-critical,
+// so just disable inline asm to prevent the warnings.
+#define ROCRAND_DISABLE_INLINE_ASM
+
+#include "../shared/arithmetic.h"
+#include "../shared/device_properties.h"
+#include "../shared/gpubuf.h"
+#include "../shared/increment.h"
+#include "../shared/rocfft_complex.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hiprand/hiprand.h>
+#include <hiprand/hiprand_kernel.h>
+#include <limits>
+#include <vector>
+
+static const unsigned int DATA_GEN_THREADS = 8;
+static const unsigned int DATA_GEN_GRID_Y_MAX = 64;
+
+template <typename T>
+struct input_val_1D
+{
+ T val1;
+};
+
+template <typename T>
+struct input_val_2D
+{
+ T val1;
+ T val2;
+};
+
+template <typename T>
+struct input_val_3D
+{
+ T val1;
+ T val2;
+ T val3;
+};
+
+template <typename T>
+static input_val_1D<T> get_input_val(const T& val)
+{
+ return input_val_1D<T>{val};
+}
+
+template <typename T>
+static input_val_2D<T> get_input_val(const std::tuple<T, T>& val)
+{
+ return input_val_2D<T>{std::get<0>(val), std::get<1>(val)};
+}
+
+template <typename T>
+static input_val_3D<T> get_input_val(const std::tuple<T, T, T>& val)
+{
+ return input_val_3D<T>{std::get<0>(val), std::get<1>(val), std::get<2>(val)};
+}
+
+template <typename T>
+__device__ static size_t
+ compute_index(const input_val_1D<T>& length, const input_val_1D<T>& stride, size_t base)
+{
+ return (length.val1 * stride.val1) + base;
+}
+
+template <typename T>
+__device__ static size_t
+ compute_index(const input_val_2D<T>& length, const input_val_2D<T>& stride, size_t base)
+{
+ return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base;
+}
+
+template <typename T>
+__device__ static size_t
+ compute_index(const input_val_3D<T>& length, const input_val_3D<T>& stride, size_t base)
+{
+ return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3)
+ + base;
+}
+
+template <typename T>
+static inline input_val_1D<T> make_zero_length(const input_val_1D<T>& whole_length)
+{
+ return input_val_1D<T>{0};
+}
+
+template <typename T>
+static inline input_val_2D<T> make_zero_length(const input_val_2D<T>& whole_length)
+{
+ return input_val_2D<T>{0, 0};
+}
+
+template <typename T>
+static inline input_val_3D<T> make_zero_length(const input_val_3D<T>& whole_length)
+{
+ return input_val_3D<T>{0, 0, 0};
+}
+
+template <typename T>
+static inline input_val_1D<T> make_unit_stride(const input_val_1D<T>& whole_length)
+{
+ return input_val_1D<T>{1};
+}
+
+template <typename T>
+static inline input_val_2D<T> make_unit_stride(const input_val_2D<T>& whole_length)
+{
+ return input_val_2D<T>{1, whole_length.val1};
+}
+
+template <typename T>
+static inline input_val_3D<T> make_unit_stride(const input_val_3D<T>& whole_length)
+{
+ return input_val_3D<T>{1, whole_length.val1, whole_length.val1 * whole_length.val2};
+}
+
+template <typename T>
+__device__ static input_val_1D<T> get_length(const size_t i, const input_val_1D<T>& whole_length)
+{
+ auto xlen = whole_length.val1;
+
+ auto xidx = i % xlen;
+
+ return input_val_1D<T>{xidx};
+}
+
+template <typename T>
+__device__ static input_val_2D<T> get_length(const size_t i, const input_val_2D<T>& whole_length)
+{
+ auto xlen = whole_length.val1;
+ auto ylen = whole_length.val2;
+
+ auto xidx = i % xlen;
+ auto yidx = i / xlen % ylen;
+
+ return input_val_2D<T>{xidx, yidx};
+}
+
+template <typename T>
+__device__ static input_val_3D<T> get_length(const size_t i, const input_val_3D<T>& whole_length)
+{
+ auto xlen = whole_length.val1;
+ auto ylen = whole_length.val2;
+ auto zlen = whole_length.val3;
+
+ auto xidx = i % xlen;
+ auto yidx = i / xlen % ylen;
+ auto zidx = i / xlen / ylen % zlen;
+
+ return input_val_3D<T>{xidx, yidx, zidx};
+}
+
+template <typename T>
+__device__ static size_t get_batch(const size_t i, const input_val_1D<T>& whole_length)
+{
+ auto xlen = whole_length.val1;
+
+ auto yidx = i / xlen;
+
+ return yidx;
+}
+
+template <typename T>
+__device__ static size_t get_batch(const size_t i, const input_val_2D<T>& whole_length)
+{
+ auto xlen = whole_length.val1;
+ auto ylen = whole_length.val2;
+
+ auto zidx = i / xlen / ylen;
+
+ return zidx;
+}
+
+template <typename T>
+__device__ static size_t get_batch(const size_t i, const input_val_3D<T>& length)
+{
+ auto xlen = length.val1;
+ auto ylen = length.val2;
+ auto zlen = length.val3;
+
+ auto widx = i / xlen / ylen / zlen;
+
+ return widx;
+}
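+
+// Illustrative example of the index decomposition above: for whole_length
+// {val1, val2, val3} = {4, 3, 2} and linear index i = 30, get_length returns
+// {30 % 4, (30 / 4) % 3, (30 / 12) % 2} = {2, 1, 0} and get_batch returns
+// 30 / 24 = 1, i.e. the element belongs to the second batch.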
+
+__device__ static double make_random_val(hiprandStatePhilox4_32_10* gen_state, double offset)
+{
+ return hiprand_uniform_double(gen_state) + offset;
+}
+
+__device__ static float make_random_val(hiprandStatePhilox4_32_10* gen_state, float offset)
+{
+ return hiprand_uniform(gen_state) + offset;
+}
+
+__device__ static _Float16 make_random_val(hiprandStatePhilox4_32_10* gen_state, _Float16 offset)
+{
+ return static_cast<_Float16>(hiprand_uniform(gen_state)) + offset;
+}
+
+template <typename Tcomplex>
+__device__ static void set_imag_zero(const size_t pos, Tcomplex* x)
+{
+ x[pos].y = 0.0;
+}
+
+template <typename Tfloat>
+__device__ static void set_imag_zero(const size_t pos, Tfloat* xreal, Tfloat* ximag)
+{
+ ximag[pos] = 0.0;
+}
+
+template <typename Tcomplex>
+__device__ static void conjugate(const size_t pos, const size_t cpos, Tcomplex* x)
+{
+ x[pos].x = x[cpos].x;
+ x[pos].y = -x[cpos].y;
+}
+
+template <typename Tfloat>
+__device__ static void conjugate(const size_t pos, const size_t cpos, Tfloat* xreal, Tfloat* ximag)
+{
+ xreal[pos] = xreal[cpos];
+ ximag[pos] = -ximag[cpos];
+}
+
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+ generate_random_interleaved_data_kernel(const Tint whole_length,
+ const Tint zero_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint istride,
+ rocfft_complex<Treal>* data)
+{
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
+ static_assert(sizeof(i) >= sizeof(isize));
+ if(i < isize)
+ {
+ auto i_length = get_length(i, whole_length);
+ auto i_batch = get_batch(i, whole_length);
+ auto i_base = i_batch * idist;
+
+ auto seed = compute_index(zero_length, istride, i_base);
+ auto idx = compute_index(i_length, istride, i_base);
+
+ hiprandStatePhilox4_32_10 gen_state;
+ hiprand_init(seed, idx, 0, &gen_state);
+
+ data[idx].x = make_random_val(&gen_state, static_cast<Treal>(-0.5));
+ data[idx].y = make_random_val(&gen_state, static_cast<Treal>(-0.5));
+ }
+}
+
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+ generate_interleaved_data_kernel(const Tint whole_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint istride,
+ const Tint ustride,
+ const Treal inv_scale,
+ rocfft_complex<Treal>* data)
+{
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
+ static_assert(sizeof(i) >= sizeof(isize));
+ if(i < isize)
+ {
+ const auto i_length = get_length(i, whole_length);
+ const auto i_batch = get_batch(i, whole_length);
+ const auto i_base = i_batch * idist;
+
+ const auto val = static_cast<Treal>(-0.5)
+ + static_cast<Treal>(
+ static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
+ * inv_scale;
+
+ const auto idx = compute_index(i_length, istride, i_base);
+
+ data[idx].x = val;
+ data[idx].y = val;
+ }
+}
+
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+ generate_random_planar_data_kernel(const Tint whole_length,
+ const Tint zero_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint istride,
+ Treal* real_data,
+ Treal* imag_data)
+{
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
+ static_assert(sizeof(i) >= sizeof(isize));
+ if(i < isize)
+ {
+ auto i_length = get_length(i, whole_length);
+ auto i_batch = get_batch(i, whole_length);
+ auto i_base = i_batch * idist;
+
+ auto seed = compute_index(zero_length, istride, i_base);
+ auto idx = compute_index(i_length, istride, i_base);
+
+ hiprandStatePhilox4_32_10 gen_state;
+ hiprand_init(seed, idx, 0, &gen_state);
+
+ real_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
+ imag_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
+ }
+}
+
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+ generate_planar_data_kernel(const Tint whole_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint istride,
+ const Tint ustride,
+ const Treal inv_scale,
+ Treal* real_data,
+ Treal* imag_data)
+{
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
+ static_assert(sizeof(i) >= sizeof(isize));
+ if(i < isize)
+ {
+ const auto i_length = get_length(i, whole_length);
+ const auto i_batch = get_batch(i, whole_length);
+ const auto i_base = i_batch * idist;
+
+ const auto val = static_cast<Treal>(-0.5)
+ + static_cast<Treal>(
+ static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
+ * inv_scale;
+
+ const auto idx = compute_index(i_length, istride, i_base);
+
+ real_data[idx] = val;
+ imag_data[idx] = val;
+ }
+}
+
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+ generate_random_real_data_kernel(const Tint whole_length,
+ const Tint zero_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint istride,
+ Treal* data)
+{
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
+ static_assert(sizeof(i) >= sizeof(isize));
+ if(i < isize)
+ {
+ auto i_length = get_length(i, whole_length);
+ auto i_batch = get_batch(i, whole_length);
+ auto i_base = i_batch * idist;
+
+ auto seed = compute_index(zero_length, istride, i_base);
+ auto idx = compute_index(i_length, istride, i_base);
+
+ hiprandStatePhilox4_32_10 gen_state;
+ hiprand_init(seed, idx, 0, &gen_state);
+
+ data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
+ }
+}
+
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+ generate_real_data_kernel(const Tint whole_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint istride,
+ const Tint ustride,
+ const Treal inv_scale,
+ Treal* data)
+{
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
+ static_assert(sizeof(i) >= sizeof(isize));
+ if(i < isize)
+ {
+ const auto i_length = get_length(i, whole_length);
+ const auto i_batch = get_batch(i, whole_length);
+ const auto i_base = i_batch * idist;
+
+ const auto val = static_cast<Treal>(-0.5)
+ + static_cast<Treal>(
+ static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
+ * inv_scale;
+
+ const auto idx = compute_index(i_length, istride, i_base);
+
+ data[idx] = val;
+ }
+}
+
+// For complex-to-real transforms, the input data must be Hermitian-symmetric.
+// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
+// space. For multi-dimensional data, this means that we only need to store a bit more
+// than half of the complex values; the rest are redundant. However, there are still
+// some restrictions:
+// * the origin and Nyquist value(s) must be real-valued
+// * some of the remaining values are still redundant, and you might get different results
+// than you expect if the values don't agree.
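+//
+// As a concrete illustration (an assumed example, not a statement of the API):
+// for a 1D complex-to-real transform of length N = 8, the input holds only
+// N/2 + 1 = 5 complex values u_0..u_4, and Hermitian symmetry requires
+// Im(u_0) = 0 and Im(u_4) = 0 (the origin and the Nyquist bin); that is what
+// the 1D kernels below enforce.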
+
+template <typename Tcomplex>
+__global__ static void impose_hermitian_symmetry_interleaved_1D_kernel(Tcomplex* x,
+ const size_t Nx,
+ const size_t xstride,
+ const size_t dist,
+ const size_t batch_total,
+ const bool Nxeven)
+{
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
+ static_assert(sizeof(id_batch) == sizeof(size_t));
+
+ if(id_batch < batch_total)
+ {
+ id_batch *= dist;
+
+ set_imag_zero(id_batch, x);
+
+ if(Nxeven)
+ set_imag_zero(id_batch + (Nx / 2) * xstride, x);
+ }
+}
+
+template <typename Tfloat>
+__global__ static void impose_hermitian_symmetry_planar_1D_kernel(Tfloat* xreal,
+ Tfloat* ximag,
+ const size_t Nx,
+ const size_t xstride,
+ const size_t dist,
+ const size_t batch_total,
+ const bool Nxeven)
+{
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
+ static_assert(sizeof(id_batch) == sizeof(size_t));
+
+ if(id_batch < batch_total)
+ {
+ id_batch *= dist;
+
+ set_imag_zero(id_batch, xreal, ximag);
+
+ if(Nxeven)
+ set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag);
+ }
+}
+
+template <typename Tcomplex>
+__global__ static void impose_hermitian_symmetry_interleaved_2D_kernel(Tcomplex* x,
+ const size_t Nx,
+ const size_t Ny,
+ const size_t xstride,
+ const size_t ystride,
+ const size_t dist,
+ const size_t batch_total,
+ const size_t x_total,
+ const bool Nxeven,
+ const bool Nyeven)
+{
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
+ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
+ static_assert(sizeof(id_batch) == sizeof(size_t));
+ static_assert(sizeof(id_x) == sizeof(size_t));
+
+ if(id_batch < batch_total)
+ {
+ id_batch *= dist;
+
+ if(id_x == 0)
+ set_imag_zero(id_batch, x);
+
+ if(id_x == 0 && Nxeven)
+ set_imag_zero(id_batch + (Nx / 2) * xstride, x);
+
+ if(id_x == 0 && Nyeven)
+ set_imag_zero(id_batch + ystride * (Ny / 2), x);
+
+ if(id_x == 0 && Nxeven && Nyeven)
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x);
+
+ if(id_x < x_total)
+ {
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x);
+
+ if(Nyeven)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
+ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
+ x);
+ }
+ }
+}
+
+template <typename Tfloat>
+__global__ static void impose_hermitian_symmetry_planar_2D_kernel(Tfloat* xreal,
+ Tfloat* ximag,
+ const size_t Nx,
+ const size_t Ny,
+ const size_t xstride,
+ const size_t ystride,
+ const size_t dist,
+ const size_t batch_total,
+ const size_t x_total,
+ const bool Nxeven,
+ const bool Nyeven)
+{
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
+ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
+ static_assert(sizeof(id_batch) == sizeof(size_t));
+ static_assert(sizeof(id_x) == sizeof(size_t));
+
+ if(id_batch < batch_total)
+ {
+ id_batch *= dist;
+
+ if(id_x == 0)
+ set_imag_zero(id_batch, xreal, ximag);
+
+ if(id_x == 0 && Nxeven)
+ set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag);
+
+ if(id_x == 0 && Nyeven)
+ set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag);
+
+ if(id_x == 0 && Nxeven && Nyeven)
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag);
+
+ if(id_x < x_total)
+ {
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)),
+ id_batch + xstride * (id_x + 1),
+ xreal,
+ ximag);
+
+ if(Nyeven)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
+ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
+ xreal,
+ ximag);
+ }
+ }
+}
+
+template <typename Tcomplex>
+__global__ static void impose_hermitian_symmetry_interleaved_3D_kernel(Tcomplex* x,
+ const size_t Nx,
+ const size_t Ny,
+ const size_t Nz,
+ const size_t xstride,
+ const size_t ystride,
+ const size_t zstride,
+ const size_t dist,
+ const size_t batch_total,
+ const size_t x_total,
+ const size_t y_total,
+ const size_t y_total_half,
+ const bool Nxeven,
+ const bool Nyeven,
+ const bool Nzeven)
+{
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
+ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
+ const auto id_y = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z;
+ static_assert(sizeof(id_batch) == sizeof(size_t));
+ static_assert(sizeof(id_x) == sizeof(size_t));
+ static_assert(sizeof(id_y) == sizeof(size_t));
+
+ if(id_batch < batch_total)
+ {
+ auto id_x_y_zero = (id_x == 0 && id_y == 0);
+
+ id_batch *= dist;
+
+ if(id_x_y_zero)
+ set_imag_zero(id_batch, x);
+
+ if(Nxeven && id_x_y_zero)
+ set_imag_zero(id_batch + xstride * (Nx / 2), x);
+
+ if(Nyeven && id_x_y_zero)
+ set_imag_zero(id_batch + ystride * (Ny / 2), x);
+
+ if(Nzeven && id_x_y_zero)
+ set_imag_zero(id_batch + zstride * (Nz / 2), x);
+
+ if(Nxeven && Nyeven && id_x_y_zero)
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x);
+
+ if(Nxeven && Nzeven && id_x_y_zero)
+ set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), x);
+
+ if(Nyeven && Nzeven && id_x_y_zero)
+ set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), x);
+
+ if(Nxeven && Nyeven && Nzeven && id_x_y_zero)
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2),
+ x);
+
+ if(id_x == 0 && id_y < y_total_half)
+ conjugate(id_batch + ystride * (Ny - (id_y + 1)), id_batch + ystride * (id_y + 1), x);
+
+ if(Nxeven && id_x == 0 && id_y < y_total_half)
+ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)),
+ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1),
+ x);
+
+ if(id_x < x_total && id_y == 0)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x);
+
+ if(Nyeven && id_x < x_total && id_y == 0)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
+ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
+ x);
+
+ if(id_x < x_total && id_y < y_total)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)),
+ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1),
+ x);
+
+ if(Nzeven)
+ {
+ if(id_x < x_total && id_y == 0)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
+ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
+ x);
+
+ if(Nyeven && id_x < x_total && id_y == 0)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
+ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
+ x);
+
+ if(id_x == 0 && id_y < y_total_half)
+ conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2),
+ id_batch + ystride * (id_y + 1) + zstride * (Nz / 2),
+ x);
+
+ if(Nxeven && id_x == 0 && id_y < y_total_half)
+ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1))
+ + zstride * (Nz / 2),
+ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2),
+ x);
+
+ if(id_x < x_total && id_y < y_total)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1))
+ + zstride * (Nz / 2),
+ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1)
+ + zstride * (Nz / 2),
+ x);
+ }
+ }
+}
+
+template <typename Tfloat>
+__global__ static void impose_hermitian_symmetry_planar_3D_kernel(Tfloat* xreal,
+ Tfloat* ximag,
+ const size_t Nx,
+ const size_t Ny,
+ const size_t Nz,
+ const size_t xstride,
+ const size_t ystride,
+ const size_t zstride,
+ const size_t dist,
+ const size_t batch_total,
+ const size_t x_total,
+ const size_t y_total,
+ const size_t y_total_half,
+ const bool Nxeven,
+ const bool Nyeven,
+ const bool Nzeven)
+{
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
+ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
+ const auto id_y = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z;
+ static_assert(sizeof(id_batch) == sizeof(size_t));
+ static_assert(sizeof(id_x) == sizeof(size_t));
+ static_assert(sizeof(id_y) == sizeof(size_t));
+
+ if(id_batch < batch_total)
+ {
+ auto id_x_y_zero = (id_x == 0 && id_y == 0);
+
+ id_batch *= dist;
+
+ if(id_x_y_zero)
+ set_imag_zero(id_batch, xreal, ximag);
+
+ if(Nxeven && id_x_y_zero)
+ set_imag_zero(id_batch + xstride * (Nx / 2), xreal, ximag);
+
+ if(Nyeven && id_x_y_zero)
+ set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag);
+
+ if(Nzeven && id_x_y_zero)
+ set_imag_zero(id_batch + zstride * (Nz / 2), xreal, ximag);
+
+ if(Nxeven && Nyeven && id_x_y_zero)
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag);
+
+ if(Nxeven && Nzeven && id_x_y_zero)
+ set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), xreal, ximag);
+
+ if(Nyeven && Nzeven && id_x_y_zero)
+ set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), xreal, ximag);
+
+ if(Nxeven && Nyeven && Nzeven && id_x_y_zero)
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2),
+ xreal,
+ ximag);
+
+ if(id_x == 0 && id_y < y_total_half)
+ conjugate(id_batch + ystride * (Ny - (id_y + 1)),
+ id_batch + ystride * (id_y + 1),
+ xreal,
+ ximag);
+
+ if(Nxeven && id_x == 0 && id_y < y_total_half)
+ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)),
+ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1),
+ xreal,
+ ximag);
+
+ if(id_x < x_total && id_y == 0)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)),
+ id_batch + xstride * (id_x + 1),
+ xreal,
+ ximag);
+
+ if(Nyeven && id_x < x_total && id_y == 0)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
+ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
+ xreal,
+ ximag);
+
+ if(id_x < x_total && id_y < y_total)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)),
+ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1),
+ xreal,
+ ximag);
+
+ if(Nzeven)
+ {
+ if(id_x < x_total && id_y == 0)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
+ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
+ xreal,
+ ximag);
+
+ if(Nyeven && id_x < x_total && id_y == 0)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
+ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
+ xreal,
+ ximag);
+
+ if(id_x == 0 && id_y < y_total_half)
+ conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2),
+ id_batch + ystride * (id_y + 1) + zstride * (Nz / 2),
+ xreal,
+ ximag);
+
+ if(Nxeven && id_x == 0 && id_y < y_total_half)
+ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1))
+ + zstride * (Nz / 2),
+ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2),
+ xreal,
+ ximag);
+
+ if(id_x < x_total && id_y < y_total)
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1))
+ + zstride * (Nz / 2),
+ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1)
+ + zstride * (Nz / 2),
+ xreal,
+ ximag);
+ }
+ }
+}
+
+// get grid dimensions for data gen kernel
+static dim3 generate_data_gridDim(const size_t isize)
+{
+ auto blockSize = DATA_GEN_THREADS;
+ // total number of blocks needed in the grid
+ auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
+
+ // Total work items per dimension in the grid is counted in
+ // uint32_t. Since each thread initializes one element, very
+ // large amounts of data will overflow this total size if we do
+ // all this work in one grid dimension, causing launch failure.
+ //
+ // CUDA also generally allows for effectively unlimited grid X
+ // dim, but Y and Z are more limited.
+ auto gridDim_y = std::min<unsigned int>(DATA_GEN_GRID_Y_MAX, numBlocks_setup);
+ auto gridDim_x = DivRoundingUp<unsigned int>(numBlocks_setup, DATA_GEN_GRID_Y_MAX);
+ return {gridDim_x, gridDim_y};
+}
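+
+// Worked example of the split above (illustrative numbers): with isize = 2^30
+// elements and DATA_GEN_THREADS = 8, numBlocks_setup = 2^27. Capping the Y
+// dimension at DATA_GEN_GRID_Y_MAX = 64 gives gridDim = {2^27 / 64, 64}
+// = {2097152, 64}, keeping each grid dimension well within 32-bit launch
+// limits.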
+
+// get grid dimensions for hermitian symmetrizer kernel
+static dim3 generate_hermitian_gridDim(const std::vector<size_t>& length,
+ const size_t batch,
+ const size_t blockSize)
+{
+ dim3 gridDim;
+
+ switch(length.size())
+ {
+ case 1:
+ gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize));
+ break;
+ case 2:
+ gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize),
+ DivRoundingUp<size_t>((length[0] + 1) / 2 - 1, blockSize));
+ break;
+ case 3:
+ gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize),
+ DivRoundingUp<size_t>((length[0] + 1) / 2 - 1, blockSize),
+ DivRoundingUp<size_t>(length[1] - 1, blockSize));
+ break;
+ default:
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+ }
+
+ return gridDim;
+}
+
+static dim3 generate_blockDim(const std::vector<size_t>& length, const size_t blockSize)
+{
+ dim3 blockDim;
+
+ switch(length.size())
+ {
+ case 1:
+ blockDim = dim3(blockSize);
+ break;
+ case 2:
+ blockDim = dim3(blockSize, blockSize);
+ break;
+ case 3:
+ blockDim = dim3(blockSize, blockSize, blockSize);
+ break;
+ default:
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+ }
+
+ return blockDim;
+}
+
+template <typename Tint, typename Treal>
+static void generate_random_interleaved_data(const Tint& whole_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint& whole_stride,
+ rocfft_complex<Treal>* input_data,
+ const hipDeviceProp_t& deviceProp)
+{
+ auto input_length = get_input_val(whole_length);
+ auto zero_length = make_zero_length(input_length);
+ auto input_stride = get_input_val(whole_stride);
+
+ dim3 gridDim = generate_data_gridDim(isize);
+ dim3 blockDim{DATA_GEN_THREADS};
+
+ launch_limits_check("generate_random_interleaved_data_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(
+ HIP_KERNEL_NAME(generate_random_interleaved_data_kernel<decltype(input_length), Treal>),
+ gridDim,
+ blockDim,
+ 0, // sharedMemBytes
+ 0, // stream
+ input_length,
+ zero_length,
+ idist,
+ isize,
+ input_stride,
+ input_data);
+ auto err = hipGetLastError();
+ if(err != hipSuccess)
+ throw std::runtime_error("generate_random_interleaved_data_kernel launch failure: "
+ + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tint, typename Treal>
+static void generate_interleaved_data(const Tint& whole_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint& whole_stride,
+ const size_t nbatch,
+ rocfft_complex<Treal>* input_data,
+ const hipDeviceProp_t& deviceProp)
+{
+ const auto input_length = get_input_val(whole_length);
+ const auto input_stride = get_input_val(whole_stride);
+ const auto unit_stride = make_unit_stride(input_length);
+
+ const auto inv_scale
+ = static_cast<Treal>(1.0)
+ / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1);
+
+ dim3 gridDim = generate_data_gridDim(isize);
+ dim3 blockDim{DATA_GEN_THREADS};
+
+ launch_limits_check("generate_interleaved_data_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(
+ HIP_KERNEL_NAME(generate_interleaved_data_kernel<decltype(input_length), Treal>),
+ gridDim,
+ blockDim,
+ 0, // sharedMemBytes
+ 0, // stream
+ input_length,
+ idist,
+ isize,
+ input_stride,
+ unit_stride,
+ inv_scale,
+ input_data);
+ auto err = hipGetLastError();
+ if(err != hipSuccess)
+ throw std::runtime_error("generate_interleaved_data_kernel launch failure: "
+ + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tint, typename Treal>
+static void generate_random_planar_data(const Tint& whole_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint& whole_stride,
+ Treal* real_data,
+ Treal* imag_data,
+ const hipDeviceProp_t& deviceProp)
+{
+ const auto input_length = get_input_val(whole_length);
+ const auto zero_length = make_zero_length(input_length);
+ const auto input_stride = get_input_val(whole_stride);
+
+ dim3 gridDim = generate_data_gridDim(isize);
+ dim3 blockDim{DATA_GEN_THREADS};
+
+ launch_limits_check("generate_random_planar_data_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(
+ HIP_KERNEL_NAME(generate_random_planar_data_kernel<decltype(input_length), Treal>),
+ gridDim,
+ blockDim,
+ 0, // sharedMemBytes
+ 0, // stream
+ input_length,
+ zero_length,
+ idist,
+ isize,
+ input_stride,
+ real_data,
+ imag_data);
+ auto err = hipGetLastError();
+ if(err != hipSuccess)
+ throw std::runtime_error("generate_random_planar_data_kernel launch failure: "
+ + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tint, typename Treal>
+static void generate_planar_data(const Tint& whole_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint& whole_stride,
+ const size_t nbatch,
+ Treal* real_data,
+ Treal* imag_data,
+ const hipDeviceProp_t& deviceProp)
+{
+ const auto input_length = get_input_val(whole_length);
+ const auto input_stride = get_input_val(whole_stride);
+ const auto unit_stride = make_unit_stride(input_length);
+
+ const auto inv_scale
+ = static_cast<Treal>(1.0)
+ / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1);
+
+ dim3 gridDim = generate_data_gridDim(isize);
+ dim3 blockDim{DATA_GEN_THREADS};
+
+ launch_limits_check("generate_planar_data_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_planar_data_kernel<decltype(input_length), Treal>),
+ gridDim,
+ blockDim,
+ 0, // sharedMemBytes
+ 0, // stream
+ input_length,
+ idist,
+ isize,
+ input_stride,
+ unit_stride,
+ inv_scale,
+ real_data,
+ imag_data);
+ auto err = hipGetLastError();
+ if(err != hipSuccess)
+ throw std::runtime_error("generate_planar_data_kernel launch failure: "
+ + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tint, typename Treal>
+static void generate_random_real_data(const Tint& whole_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint& whole_stride,
+ Treal* input_data,
+ const hipDeviceProp_t& deviceProp)
+{
+ const auto input_length = get_input_val(whole_length);
+ const auto zero_length = make_zero_length(input_length);
+ const auto input_stride = get_input_val(whole_stride);
+
+ dim3 gridDim = generate_data_gridDim(isize);
+ dim3 blockDim{DATA_GEN_THREADS};
+
+ launch_limits_check("generate_random_real_data_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(
+ HIP_KERNEL_NAME(generate_random_real_data_kernel<decltype(input_length), Treal>),
+ gridDim,
+ blockDim,
+ 0, // sharedMemBytes
+ 0, // stream
+ input_length,
+ zero_length,
+ idist,
+ isize,
+ input_stride,
+ input_data);
+ auto err = hipGetLastError();
+ if(err != hipSuccess)
+ throw std::runtime_error("generate_random_real_data_kernel launch failure: "
+ + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tint, typename Treal>
+static void generate_real_data(const Tint& whole_length,
+ const size_t idist,
+ const size_t isize,
+ const Tint& whole_stride,
+ const size_t nbatch,
+ Treal* input_data,
+ const hipDeviceProp_t& deviceProp)
+{
+ const auto input_length = get_input_val(whole_length);
+ const auto input_stride = get_input_val(whole_stride);
+ const auto unit_stride = make_unit_stride(input_length);
+
+ const auto inv_scale
+ = static_cast<Treal>(1.0)
+ / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1);
+
+ dim3 gridDim = generate_data_gridDim(isize);
+ dim3 blockDim{DATA_GEN_THREADS};
+
+ launch_limits_check("generate_real_data_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_real_data_kernel<decltype(input_length), Treal>),
+ gridDim,
+ blockDim,
+ 0, // sharedMemBytes
+ 0, // stream
+ input_length,
+ idist,
+ isize,
+ input_stride,
+ unit_stride,
+ inv_scale,
+ input_data);
+ auto err = hipGetLastError();
+ if(err != hipSuccess)
+ throw std::runtime_error("generate_real_data_kernel launch failure: "
+ + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tcomplex>
+static void impose_hermitian_symmetry_interleaved(const std::vector<size_t>& length,
+ const std::vector<size_t>& ilength,
+ const std::vector<size_t>& stride,
+ const size_t dist,
+ const size_t batch,
+ Tcomplex* input_data,
+ const hipDeviceProp_t& deviceProp)
+{
+ auto blockSize = DATA_GEN_THREADS;
+ auto blockDim = generate_blockDim(length, blockSize);
+ auto gridDim = generate_hermitian_gridDim(length, batch, blockSize);
+
+ switch(length.size())
+ {
+ case 1:
+ {
+ launch_limits_check(
+ "impose_hermitian_symmetry_interleaved_1D_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel<Tcomplex>,
+ gridDim,
+ blockDim,
+ 0,
+ 0,
+ input_data,
+ length[0],
+ stride[0],
+ dist,
+ batch,
+ length[0] % 2 == 0);
+
+ break;
+ }
+ case 2:
+ {
+ launch_limits_check(
+ "impose_hermitian_symmetry_interleaved_2D_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel<Tcomplex>,
+ gridDim,
+ blockDim,
+ 0,
+ 0,
+ input_data,
+ length[0],
+ length[1],
+ stride[0],
+ stride[1],
+ dist,
+ batch,
+ (ilength[0] + 1) / 2 - 1,
+ length[0] % 2 == 0,
+ length[1] % 2 == 0);
+
+ break;
+ }
+ case 3:
+ {
+ launch_limits_check(
+ "impose_hermitian_symmetry_interleaved_3D_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel<Tcomplex>,
+ gridDim,
+ blockDim,
+ 0,
+ 0,
+ input_data,
+ length[0],
+ length[1],
+ length[2],
+ stride[0],
+ stride[1],
+ stride[2],
+ dist,
+ batch,
+ (ilength[0] + 1) / 2 - 1,
+ ilength[1] - 1,
+ (ilength[1] + 1) / 2 - 1,
+ length[0] % 2 == 0,
+ length[1] % 2 == 0,
+ length[2] % 2 == 0);
+ break;
+ }
+ default:
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+ }
+ auto err = hipGetLastError();
+ if(err != hipSuccess)
+ throw std::runtime_error("impose_hermitian_symmetry_interleaved_kernel launch failure: "
+ + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tfloat>
+static void impose_hermitian_symmetry_planar(const std::vector<size_t>& length,
+ const std::vector<size_t>& ilength,
+ const std::vector<size_t>& stride,
+ const size_t dist,
+ const size_t batch,
+ Tfloat* input_data_real,
+ Tfloat* input_data_imag,
+ const hipDeviceProp_t& deviceProp)
+{
+ auto blockSize = DATA_GEN_THREADS;
+ auto blockDim = generate_blockDim(length, blockSize);
+ auto gridDim = generate_hermitian_gridDim(length, batch, blockSize);
+
+ switch(length.size())
+ {
+ case 1:
+ {
+ launch_limits_check(
+ "impose_hermitian_symmetry_planar_1D_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1D_kernel<Tfloat>,
+ gridDim,
+ blockDim,
+ 0,
+ 0,
+ input_data_real,
+ input_data_imag,
+ length[0],
+ stride[0],
+ dist,
+ batch,
+ length[0] % 2 == 0);
+
+ break;
+ }
+ case 2:
+ {
+ launch_limits_check(
+ "impose_hermitian_symmetry_planar_2D_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2D_kernel<Tfloat>,
+ gridDim,
+ blockDim,
+ 0,
+ 0,
+ input_data_real,
+ input_data_imag,
+ length[0],
+ length[1],
+ stride[0],
+ stride[1],
+ dist,
+ batch,
+ (ilength[0] + 1) / 2 - 1,
+ length[0] % 2 == 0,
+ length[1] % 2 == 0);
+
+ break;
+ }
+ case 3:
+ {
+ launch_limits_check(
+ "impose_hermitian_symmetry_planar_3D_kernel", gridDim, blockDim, deviceProp);
+
+ hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3D_kernel<Tfloat>,
+ gridDim,
+ blockDim,
+ 0,
+ 0,
+ input_data_real,
+ input_data_imag,
+ length[0],
+ length[1],
+ length[2],
+ stride[0],
+ stride[1],
+ stride[2],
+ dist,
+ batch,
+ (ilength[0] + 1) / 2 - 1,
+ ilength[1] - 1,
+ (ilength[1] + 1) / 2 - 1,
+ length[0] % 2 == 0,
+ length[1] % 2 == 0,
+ length[2] % 2 == 0);
+ break;
+ }
+ default:
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+ }
+ auto err = hipGetLastError();
+ if(err != hipSuccess)
+ throw std::runtime_error("impose_hermitian_symmetry_planar_kernel launch failure: "
+ + std::string(hipGetErrorName(err)));
+}
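+
+// Usage sketch (a sketch only; argument names follow the callers in fft_params.h):
+// after generating complex data on the device, Hermitian symmetry is imposed with
+//     impose_hermitian_symmetry_interleaved(length, ilength, istride, idist, nbatch,
+//                                           ibuffer, deviceProp);
+// where ibuffer is a device pointer to rocfft_complex data and deviceProp can come
+// from get_curr_device_prop() in device_properties.h.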
+
+#endif // DATA_GEN_DEVICE_H
diff --git a/shared/data_gen_host.h b/shared/data_gen_host.h
new file mode 100644
index 0000000..29d3854
--- /dev/null
+++ b/shared/data_gen_host.h
@@ -0,0 +1,881 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef DATA_GEN_HOST_H
+#define DATA_GEN_HOST_H
+
+#include "../shared/hostbuf.h"
+#include "../shared/increment.h"
+#include <complex>
+#include <limits>
+#include <random>
+#include <tuple>
+#include <vector>
+
+// Specialized computation of index given 1-, 2-, 3- dimension length + stride
+template <typename T1, typename T2>
+size_t compute_index(T1 length, T2 stride, size_t base)
+{
+ return (length * stride) + base;
+}
+
+template <typename T1, typename T2>
+size_t
+ compute_index(const std::tuple<T1, T1>& length, const std::tuple<T2, T2>& stride, size_t base)
+{
+ static_assert(std::is_integral<T1>::value, "Integral required.");
+ static_assert(std::is_integral<T2>::value, "Integral required.");
+ return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
+ + base;
+}
+
+template <typename T1, typename T2>
+size_t compute_index(const std::tuple<T1, T1, T1>& length,
+ const std::tuple<T2, T2, T2>& stride,
+ size_t base)
+{
+ static_assert(std::is_integral<T1>::value, "Integral required.");
+ static_assert(std::is_integral<T2>::value, "Integral required.");
+ return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
+ + (std::get<2>(length) * std::get<2>(stride)) + base;
+}
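+
+// Worked example (names are illustrative): for a 2D index (i, j) with strides (1, Nx),
+// all of type size_t,
+//     compute_index(std::make_tuple(i, j), std::make_tuple(size_t(1), Nx), base)
+// evaluates to i * 1 + j * Nx + base, i.e. the usual linearized offset.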
+
+// count the number of total iterations for 1-, 2-, and 3-D dimensions
+template <typename T1>
+size_t count_iters(const T1& i)
+{
+ return i;
+}
+
+template <typename T1>
+size_t count_iters(const std::tuple<T1, T1>& i)
+{
+ return std::get<0>(i) * std::get<1>(i);
+}
+
+template <typename T1>
+size_t count_iters(const std::tuple<T1, T1, T1>& i)
+{
+ return std::get<0>(i) * std::get<1>(i) * std::get<2>(i);
+}
+
+template <typename T1>
+T1 make_unit_stride(const T1& whole_length)
+{
+ return static_cast<T1>(1);
+}
+
+template <typename T1>
+std::tuple<T1, T1> make_unit_stride(const std::tuple<T1, T1>& whole_length)
+{
+ return std::make_tuple(static_cast<T1>(1), static_cast<T1>(std::get<0>(whole_length)));
+}
+
+template <typename T1>
+std::tuple<T1, T1, T1> make_unit_stride(const std::tuple<T1, T1, T1>& whole_length)
+{
+ return std::make_tuple(static_cast<T1>(1),
+ static_cast<T1>(std::get<0>(whole_length)),
+ static_cast<T1>(std::get<0>(whole_length))
+ * static_cast<T1>(std::get<1>(whole_length)));
+}
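+
+// Worked example: make_unit_stride(std::make_tuple(size_t(4), size_t(8), size_t(2)))
+// returns (1, 4, 32), i.e. contiguous strides for 4 x 8 x 2 elements; the linear-ramp
+// generators below combine this with compute_index to assign each element a unique
+// position in [0, N).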
+
+// Work out how many partitions to break our iteration problem into
+template <typename T1>
+static size_t compute_partition_count(T1 length)
+{
+#ifdef _OPENMP
+    // We seem to get contention from too many threads, which slows
+    // things down; this is particularly noticeable with mix_3D tests.
+ static const size_t MAX_PARTITIONS = 8;
+ size_t iters = count_iters(length);
+ size_t hw_threads = std::min(MAX_PARTITIONS, static_cast<size_t>(omp_get_num_procs()));
+ if(!hw_threads)
+ return 1;
+
+    // Don't bother threading problem sizes that are too small: pick
+    // an arbitrary minimum number of iterations and ensure that each
+    // thread has at least that many iterations to process.
+ static const size_t MIN_ITERS_PER_THREAD = 2048;
+
+ // either use the whole CPU, or use ceil(iters/iters_per_thread)
+ return std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD + 1) / MIN_ITERS_PER_THREAD);
+#else
+ return 1;
+#endif
+}
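+
+// Worked example (thread count is illustrative): with 8 available OpenMP threads
+// and count_iters(length) == 10000, the partition count is
+//     std::min(8, (10000 + 2048 + 1) / 2048) == std::min<size_t>(8, 5) == 5,
+// so small problems use few threads while large ones are capped at MAX_PARTITIONS.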
+
+// Break a scalar length into some number of pieces, returning
+// [(start0, end0), (start1, end1), ...]
+template <typename T1>
+std::vector<std::pair<T1, T1>> partition_base(const T1& length, size_t num_parts)
+{
+ static_assert(std::is_integral<T1>::value, "Integral required.");
+
+ // make sure we don't exceed the length
+ num_parts = std::min(length, num_parts);
+
+ std::vector<std::pair<T1, T1>> ret(num_parts);
+ auto partition_size = length / num_parts;
+ T1 cur_partition = 0;
+ for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size)
+ {
+ ret[i].first = cur_partition;
+ ret[i].second = cur_partition + partition_size;
+ }
+ // last partition might not divide evenly, fix it up
+ ret.back().second = length;
+ return ret;
+}
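+
+// Worked example: partition_base(size_t(10), 3) gives partition_size = 3 and returns
+// {(0, 3), (3, 6), (6, 10)}; the last pair is stretched to cover the remainder, so
+// every element is covered exactly once.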
+
+// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
+template <typename T1>
+std::vector<std::pair<T1, T1>> partition_rowmajor(const T1& length)
+{
+ return partition_base(length, compute_partition_count(length));
+}
+
+// Partition on the leftmost part of the tuple, for row-major indexing
+template <typename T1>
+std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
+ partition_rowmajor(const std::tuple<T1, T1>& length)
+{
+ auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
+ std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
+ for(size_t i = 0; i < partitions.size(); ++i)
+ {
+ std::get<0>(ret[i].first) = partitions[i].first;
+ std::get<1>(ret[i].first) = 0;
+ std::get<0>(ret[i].second) = partitions[i].second;
+ std::get<1>(ret[i].second) = std::get<1>(length);
+ }
+ return ret;
+}
+template <typename T1>
+std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
+ partition_rowmajor(const std::tuple<T1, T1, T1>& length)
+{
+ auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
+ std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
+ for(size_t i = 0; i < partitions.size(); ++i)
+ {
+ std::get<0>(ret[i].first) = partitions[i].first;
+ std::get<1>(ret[i].first) = 0;
+ std::get<2>(ret[i].first) = 0;
+ std::get<0>(ret[i].second) = partitions[i].second;
+ std::get<1>(ret[i].second) = std::get<1>(length);
+ std::get<2>(ret[i].second) = std::get<2>(length);
+ }
+ return ret;
+}
+
+// For complex-to-real transforms, the input data must be Hermitian-symmetric.
+// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
+// space. For multi-dimensional data, this means that we only need to store a bit more
+// than half of the complex values; the rest are redundant. However, there are still
+// some restrictions:
+// * the origin and Nyquist value(s) must be real-valued
+// * some of the remaining values are still redundant, and you might get different results
+// than you expect if the values don't agree.
+// Below are host-side routines which impose Hermitian symmetry on a complex array
+// of the given dimensions.
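+//
+// Worked 1D example: a forward real transform of even length N stores N/2 + 1
+// complex values; for N = 8 that is indices 0..4, where index 0 (the DC term) and
+// index 4 (the Nyquist term) must have zero imaginary part, which is exactly what
+// the 1D helpers below enforce.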
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_interleaved_1D(std::vector<hostbuf>& vals,
+ const std::vector<Tsize>& length,
+ const std::vector<Tsize>& istride,
+ const Tsize idist,
+ const Tsize nbatch)
+{
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
+ {
+ auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist;
+
+ data[0].imag(0.0);
+
+ if(length[0] % 2 == 0)
+ {
+ data[istride[0] * (length[0] / 2)].imag(0.0);
+ }
+ }
+}
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_planar_1D(std::vector<hostbuf>& vals,
+ const std::vector<Tsize>& length,
+ const std::vector<Tsize>& istride,
+ const Tsize idist,
+ const Tsize nbatch)
+{
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
+ {
+ auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist;
+
+ data_imag[0] = 0.0;
+
+ if(length[0] % 2 == 0)
+ {
+ data_imag[istride[0] * (length[0] / 2)] = 0.0;
+ }
+ }
+}
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_interleaved_2D(std::vector<hostbuf>& vals,
+ const std::vector<Tsize>& length,
+ const std::vector<Tsize>& istride,
+ const Tsize idist,
+ const Tsize nbatch)
+{
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
+ {
+ auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist;
+
+ data[0].imag(0.0);
+
+ if(length[0] % 2 == 0)
+ {
+ data[istride[0] * (length[0] / 2)].imag(0.0);
+ }
+
+ if(length[1] % 2 == 0)
+ {
+ data[istride[1] * (length[1] / 2)].imag(0.0);
+ }
+
+ if(length[0] % 2 == 0 && length[1] % 2 == 0)
+ {
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0);
+ }
+
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]);
+ }
+
+ if(length[1] % 2 == 0)
+ {
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
+ = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]);
+ }
+ }
+ }
+}
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_planar_2D(std::vector<hostbuf>& vals,
+ const std::vector<Tsize>& length,
+ const std::vector<Tsize>& istride,
+ const Tsize idist,
+ const Tsize nbatch)
+{
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
+ {
+ auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist;
+ auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist;
+
+ data_imag[0] = 0.0;
+
+ if(length[0] % 2 == 0)
+ {
+ data_imag[istride[0] * (length[0] / 2)] = 0.0;
+ }
+
+ if(length[1] % 2 == 0)
+ {
+ data_imag[istride[1] * (length[1] / 2)] = 0.0;
+ }
+
+ if(length[0] % 2 == 0 && length[1] % 2 == 0)
+ {
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0;
+ }
+
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i];
+ data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i];
+ }
+
+ if(length[1] % 2 == 0)
+ {
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
+ = data_real[istride[0] * i + istride[1] * (length[1] / 2)];
+ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
+ = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)];
+ }
+ }
+ }
+}
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_interleaved_3D(std::vector<hostbuf>& vals,
+ const std::vector<Tsize>& length,
+ const std::vector<Tsize>& istride,
+ const Tsize idist,
+ const Tsize nbatch)
+{
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
+ {
+ auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist;
+
+ data[0].imag(0.0);
+
+ if(length[0] % 2 == 0)
+ {
+ data[istride[0] * (length[0] / 2)].imag(0.0);
+ }
+
+ if(length[1] % 2 == 0)
+ {
+ data[istride[1] * (length[1] / 2)].imag(0.0);
+ }
+
+ if(length[2] % 2 == 0)
+ {
+ data[istride[2] * (length[2] / 2)].imag(0.0);
+ }
+
+ if(length[0] % 2 == 0 && length[1] % 2 == 0)
+ {
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0);
+ }
+
+ if(length[0] % 2 == 0 && length[2] % 2 == 0)
+ {
+ data[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)].imag(0.0);
+ }
+ if(length[1] % 2 == 0 && length[2] % 2 == 0)
+ {
+ data[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)].imag(0.0);
+ }
+
+ if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0)
+ {
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)
+ + istride[2] * (length[2] / 2)]
+ .imag(0.0);
+ }
+
+ // y-axis:
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
+ {
+ data[istride[1] * (length[1] - j)] = std::conj(data[istride[1] * j]);
+ }
+
+ if(length[0] % 2 == 0)
+ {
+ // y-axis at x-nyquist
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
+ {
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)]
+ = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j]);
+ }
+ }
+
+ // x-axis:
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]);
+ }
+
+ if(length[1] % 2 == 0)
+ {
+ // x-axis at y-nyquist
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
+ = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]);
+ }
+ }
+
+ // x-y plane:
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ for(unsigned int j = 1; j < length[1]; ++j)
+ {
+ data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)]
+ = std::conj(data[istride[0] * i + istride[1] * j]);
+ }
+ }
+
+ if(length[2] % 2 == 0)
+ {
+ // x-axis at z-nyquist
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
+ = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]);
+ }
+ if(length[1] % 2 == 0)
+ {
+ // x-axis at yz-nyquist
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
+ = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]);
+ }
+ }
+
+ // y-axis: at z-nyquist
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
+ {
+ data[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)]
+ = std::conj(data[istride[1] * j + istride[2] * (length[2] / 2)]);
+ }
+
+ if(length[0] % 2 == 0)
+ {
+ // y-axis: at xz-nyquist
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
+ {
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)
+ + istride[2] * (length[2] / 2)]
+ = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j
+ + istride[2] * (length[2] / 2)]);
+ }
+ }
+
+ // x-y plane: at z-nyquist
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ for(unsigned int j = 1; j < length[1]; ++j)
+ {
+ data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)
+ + istride[2] * (length[2] / 2)]
+ = std::conj(
+ data[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]);
+ }
+ }
+ }
+ }
+}
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_planar_3D(std::vector<hostbuf>& vals,
+ const std::vector<Tsize>& length,
+ const std::vector<Tsize>& istride,
+ const Tsize idist,
+ const Tsize nbatch)
+{
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
+ {
+ auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist;
+ auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist;
+
+ data_imag[0] = 0.0;
+
+ if(length[0] % 2 == 0)
+ {
+ data_imag[istride[0] * (length[0] / 2)] = 0.0;
+ }
+
+ if(length[1] % 2 == 0)
+ {
+ data_imag[istride[1] * (length[1] / 2)] = 0.0;
+ }
+
+ if(length[2] % 2 == 0)
+ {
+ data_imag[istride[2] * (length[2] / 2)] = 0.0;
+ }
+
+ if(length[0] % 2 == 0 && length[1] % 2 == 0)
+ {
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0;
+ }
+
+ if(length[0] % 2 == 0 && length[2] % 2 == 0)
+ {
+ data_imag[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)] = 0.0;
+ }
+ if(length[1] % 2 == 0 && length[2] % 2 == 0)
+ {
+ data_imag[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)] = 0.0;
+ }
+
+ if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0)
+ {
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)
+ + istride[2] * (length[2] / 2)]
+ = 0.0;
+ }
+
+ // y-axis:
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
+ {
+ data_real[istride[1] * (length[1] - j)] = data_real[istride[1] * j];
+ data_imag[istride[1] * (length[1] - j)] = -data_imag[istride[1] * j];
+ }
+
+ if(length[0] % 2 == 0)
+ {
+ // y-axis at x-nyquist
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
+ {
+ data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)]
+ = data_real[istride[0] * (length[0] / 2) + istride[1] * j];
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)]
+ = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j];
+ }
+ }
+
+ // x-axis:
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i];
+ data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i];
+ }
+
+ if(length[1] % 2 == 0)
+ {
+ // x-axis at y-nyquist
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
+ = data_real[istride[0] * i + istride[1] * (length[1] / 2)];
+ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
+ = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)];
+ }
+ }
+
+ // x-y plane:
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ for(unsigned int j = 1; j < length[1]; ++j)
+ {
+ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)]
+ = data_real[istride[0] * i + istride[1] * j];
+ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)]
+ = -data_imag[istride[0] * i + istride[1] * j];
+ }
+ }
+
+ if(length[2] % 2 == 0)
+ {
+ // x-axis at z-nyquist
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
+ = data_real[istride[0] * i + istride[2] * (length[2] / 2)];
+ data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
+ = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)];
+ }
+ if(length[1] % 2 == 0)
+ {
+ // x-axis at yz-nyquist
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
+ = data_real[istride[0] * i + istride[2] * (length[2] / 2)];
+ data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
+ = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)];
+ }
+ }
+
+ // y-axis: at z-nyquist
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
+ {
+ data_real[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)]
+ = data_real[istride[1] * j + istride[2] * (length[2] / 2)];
+ data_imag[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)]
+ = -data_imag[istride[1] * j + istride[2] * (length[2] / 2)];
+ }
+
+ if(length[0] % 2 == 0)
+ {
+ // y-axis: at xz-nyquist
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
+ {
+ data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)
+ + istride[2] * (length[2] / 2)]
+ = data_real[istride[0] * (length[0] / 2) + istride[1] * j
+ + istride[2] * (length[2] / 2)];
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)
+ + istride[2] * (length[2] / 2)]
+ = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j
+ + istride[2] * (length[2] / 2)];
+ }
+ }
+
+ // x-y plane: at z-nyquist
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
+ {
+ for(unsigned int j = 1; j < length[1]; ++j)
+ {
+ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)
+ + istride[2] * (length[2] / 2)]
+ = data_real[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)];
+ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)
+ + istride[2] * (length[2] / 2)]
+ = -data_imag[istride[0] * i + istride[1] * j
+ + istride[2] * (length[2] / 2)];
+ }
+ }
+ }
+ }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_random_interleaved_data(std::vector<hostbuf>& input,
+ const Tint1& whole_length,
+ const Tint1& whole_stride,
+ const size_t idist,
+ const size_t nbatch)
+{
+ auto idata = (std::complex<Tfloat>*)input[0].data();
+ size_t i_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+ {
+#pragma omp parallel for num_threads(partitions.size())
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ std::mt19937 gen(compute_index(index, whole_stride, i_base));
+ do
+ {
+ const auto i = compute_index(index, whole_stride, i_base);
+ const Tfloat x = (Tfloat)gen() / (Tfloat)gen.max();
+ const Tfloat y = (Tfloat)gen() / (Tfloat)gen.max();
+ const std::complex<Tfloat> val(x, y);
+ idata[i] = val;
+ } while(increment_rowmajor(index, length));
+ }
+ }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_interleaved_data(std::vector<hostbuf>& input,
+ const Tint1& whole_length,
+ const Tint1& whole_stride,
+ const size_t idist,
+ const size_t nbatch)
+{
+ auto idata = (std::complex<Tfloat>*)input[0].data();
+ size_t i_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ auto unit_stride = make_unit_stride(whole_length);
+
+ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
+
+ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+ {
+#pragma omp parallel for num_threads(partitions.size())
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto val_xy
+ = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale;
+
+ const std::complex<Tfloat> val(val_xy, val_xy);
+
+ const auto i = compute_index(index, whole_stride, i_base);
+
+ idata[i] = val;
+ } while(increment_rowmajor(index, length));
+ }
+ }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_random_planar_data(std::vector<hostbuf>& input,
+ const Tint1& whole_length,
+ const Tint1& whole_stride,
+ const size_t idist,
+ const size_t nbatch)
+{
+ auto ireal = (Tfloat*)input[0].data();
+ auto iimag = (Tfloat*)input[1].data();
+ size_t i_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+ {
+#pragma omp parallel for num_threads(partitions.size())
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ std::mt19937 gen(compute_index(index, whole_stride, i_base));
+ do
+ {
+ const auto i = compute_index(index, whole_stride, i_base);
+ const std::complex<Tfloat> val((Tfloat)gen() / (Tfloat)gen.max(),
+ (Tfloat)gen() / (Tfloat)gen.max());
+ ireal[i] = val.real();
+ iimag[i] = val.imag();
+ } while(increment_rowmajor(index, length));
+ }
+ }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_planar_data(std::vector<hostbuf>& input,
+ const Tint1& whole_length,
+ const Tint1& whole_stride,
+ const size_t idist,
+ const size_t nbatch)
+{
+
+ auto ireal = (Tfloat*)input[0].data();
+ auto iimag = (Tfloat*)input[1].data();
+ size_t i_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ auto unit_stride = make_unit_stride(whole_length);
+
+ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
+
+ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+ {
+#pragma omp parallel for num_threads(partitions.size())
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto val_xy
+ = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale;
+
+ const auto i = compute_index(index, whole_stride, i_base);
+
+ ireal[i] = val_xy;
+ iimag[i] = val_xy;
+ } while(increment_rowmajor(index, length));
+ }
+ }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_random_real_data(std::vector<hostbuf>& input,
+ const Tint1& whole_length,
+ const Tint1& whole_stride,
+ const size_t idist,
+ const size_t nbatch)
+{
+ auto idata = (Tfloat*)input[0].data();
+ size_t i_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+ {
+#pragma omp parallel for num_threads(partitions.size())
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ std::mt19937 gen(compute_index(index, whole_stride, i_base));
+ do
+ {
+ const auto i = compute_index(index, whole_stride, i_base);
+ const Tfloat val = (Tfloat)gen() / (Tfloat)gen.max();
+ idata[i] = val;
+ } while(increment_rowmajor(index, length));
+ }
+ }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_real_data(std::vector<hostbuf>& input,
+ const Tint1& whole_length,
+ const Tint1& whole_stride,
+ const size_t idist,
+ const size_t nbatch)
+{
+
+ auto idata = (Tfloat*)input[0].data();
+ size_t i_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ auto unit_stride = make_unit_stride(whole_length);
+
+ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
+
+ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+ {
+#pragma omp parallel for num_threads(partitions.size())
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto i = compute_index(index, whole_stride, i_base);
+
+ idata[i]
+ = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale;
+ } while(increment_rowmajor(index, length));
+ }
+ }
+}
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_interleaved(std::vector<hostbuf>& vals,
+ const std::vector<Tsize>& length,
+ const std::vector<Tsize>& istride,
+ const Tsize idist,
+ const Tsize nbatch)
+{
+ switch(length.size())
+ {
+ case 1:
+ impose_hermitian_symmetry_interleaved_1D<Tfloat>(vals, length, istride, idist, nbatch);
+ break;
+ case 2:
+ impose_hermitian_symmetry_interleaved_2D<Tfloat>(vals, length, istride, idist, nbatch);
+ break;
+ case 3:
+ impose_hermitian_symmetry_interleaved_3D<Tfloat>(vals, length, istride, idist, nbatch);
+ break;
+ default:
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+ }
+}
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_planar(std::vector<hostbuf>& vals,
+ const std::vector<Tsize>& length,
+ const std::vector<Tsize>& istride,
+ const Tsize idist,
+ const Tsize nbatch)
+{
+ switch(length.size())
+ {
+ case 1:
+ impose_hermitian_symmetry_planar_1D<Tfloat>(vals, length, istride, idist, nbatch);
+ break;
+ case 2:
+ impose_hermitian_symmetry_planar_2D<Tfloat>(vals, length, istride, idist, nbatch);
+ break;
+ case 3:
+ impose_hermitian_symmetry_planar_3D<Tfloat>(vals, length, istride, idist, nbatch);
+ break;
+ default:
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+ }
+}
+
+#endif // DATA_GEN_HOST_H
diff --git a/shared/device_properties.h b/shared/device_properties.h
new file mode 100644
index 0000000..6e2e1e1
--- /dev/null
+++ b/shared/device_properties.h
@@ -0,0 +1,74 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_DEVICE_PROPS_H
+#define ROCFFT_DEVICE_PROPS_H
+
+#include <cstdint>
+#include <hip/hip_runtime_api.h>
+#include <stdexcept>
+#include <string>
+
+// get device properties
+static hipDeviceProp_t get_curr_device_prop()
+{
+ hipDeviceProp_t prop;
+ int deviceId = 0;
+ if(hipGetDevice(&deviceId) != hipSuccess)
+ throw std::runtime_error("hipGetDevice failed.");
+
+ if(hipGetDeviceProperties(&prop, deviceId) != hipSuccess)
+ throw std::runtime_error("hipGetDeviceProperties failed for deviceId "
+ + std::to_string(deviceId));
+
+ return prop;
+}
+
+// Check that the given grid/block dims will fit into the limits in
+// the device properties. Throws std::runtime_error if the limits
+// are exceeded.
+static void launch_limits_check(const std::string& kernel_name,
+ const dim3 gridDim,
+ const dim3 blockDim,
+ const hipDeviceProp_t& deviceProp)
+{
+ // Need lots of casting here because dim3 is unsigned but device
+ // props are signed. Cast direct comparisons to fix signedness
+ // issues. Promote types to 64-bit when multiplying to try to
+ // avoid overflow.
+
+ // Block limits along each dimension
+ if(blockDim.x > static_cast<uint32_t>(deviceProp.maxThreadsDim[0])
+ || blockDim.y > static_cast<uint32_t>(deviceProp.maxThreadsDim[1])
+ || blockDim.z > static_cast<uint32_t>(deviceProp.maxThreadsDim[2]))
+ throw std::runtime_error("max threads per dim exceeded: " + kernel_name);
+
+ // Total threads for the whole block
+ if(static_cast<uint64_t>(blockDim.x) * blockDim.y * blockDim.z
+ > static_cast<uint64_t>(deviceProp.maxThreadsPerBlock))
+ throw std::runtime_error("max threads per block exceeded: " + kernel_name);
+
+ // Grid dimension limits
+ if(gridDim.x > static_cast<uint32_t>(deviceProp.maxGridSize[0])
+ || gridDim.y > static_cast<uint32_t>(deviceProp.maxGridSize[1])
+ || gridDim.z > static_cast<uint32_t>(deviceProp.maxGridSize[2]))
+ throw std::runtime_error("max grid size exceeded: " + kernel_name);
+}
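+
+// Usage sketch (kernel name is illustrative): query the device once, then validate
+// the launch geometry before each launch, e.g.
+//     auto deviceProp = get_curr_device_prop();
+//     launch_limits_check("my_kernel", gridDim, blockDim, deviceProp);
+//     hipLaunchKernelGGL(my_kernel, gridDim, blockDim, 0, 0, ...);
+// This mirrors how the data generation helpers in data_gen_device.h use it.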
+
+#endif
diff --git a/shared/enum_to_string.h b/shared/enum_to_string.h
new file mode 100644
index 0000000..1c2fba0
--- /dev/null
+++ b/shared/enum_to_string.h
@@ -0,0 +1,81 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ENUM_TO_STRING_H
+#define ENUM_TO_STRING_H
+
+#include "fft_params.h"
+
+// Return the string of the hipError code.
+static std::string hipError_to_string(const hipError_t ret)
+{
+ switch(ret)
+ {
+ case hipSuccess:
+ return "hipSuccess";
+ case hipErrorInvalidContext:
+ return "hipErrorInvalidContext";
+ case hipErrorInvalidKernelFile:
+ return "hipErrorInvalidKernelFile";
+ case hipErrorMemoryAllocation:
+ return "hipErrorMemoryAllocation";
+ case hipErrorInitializationError:
+ return "hipErrorInitializationError";
+ case hipErrorLaunchFailure:
+ return "hipErrorLaunchFailure";
+ case hipErrorLaunchOutOfResources:
+ return "hipErrorLaunchOutOfResources";
+ case hipErrorInvalidDevice:
+ return "hipErrorInvalidDevice";
+ case hipErrorInvalidValue:
+ return "hipErrorInvalidValue";
+ case hipErrorInvalidDevicePointer:
+ return "hipErrorInvalidDevicePointer";
+ case hipErrorInvalidMemcpyDirection:
+ return "hipErrorInvalidMemcpyDirection";
+ case hipErrorUnknown:
+ return "hipErrorUnknown";
+ case hipErrorInvalidResourceHandle:
+ return "hipErrorInvalidResourceHandle";
+ case hipErrorNotReady:
+ return "hipErrorNotReady";
+ case hipErrorNoDevice:
+ return "hipErrorNoDevice";
+ case hipErrorPeerAccessAlreadyEnabled:
+ return "hipErrorPeerAccessAlreadyEnabled";
+ case hipErrorPeerAccessNotEnabled:
+ return "hipErrorPeerAccessNotEnabled";
+ case hipErrorRuntimeMemory:
+ return "hipErrorRuntimeMemory";
+ case hipErrorRuntimeOther:
+ return "hipErrorRuntimeOther";
+ case hipErrorHostMemoryAlreadyRegistered:
+ return "hipErrorHostMemoryAlreadyRegistered";
+ case hipErrorHostMemoryNotRegistered:
+ return "hipErrorHostMemoryNotRegistered";
+ case hipErrorMapBufferObjectFailed:
+ return "hipErrorMapBufferObjectFailed";
+ case hipErrorTbd:
+ return "hipErrorTbd";
+ default:
+ throw std::runtime_error("unknown hipError");
+ }
+}
+#endif
diff --git a/shared/environment.h b/shared/environment.h
new file mode 100644
index 0000000..7be56a0
--- /dev/null
+++ b/shared/environment.h
@@ -0,0 +1,97 @@
+// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+// wrappers around environment variable routines
+
+#pragma once
+
+#include <string>
+
+// Windows provides "getenv" and "_putenv", but those modify the
+// runtime's copy of the environment. The actual environment in the
+// process control block is accessed using GetEnvironmentVariable and
+// SetEnvironmentVariable.
+
+#ifdef WIN32
+#include <windows.h>
+static void rocfft_setenv(const char* var, const char* value)
+{
+ SetEnvironmentVariable(var, value);
+}
+static void rocfft_unsetenv(const char* var)
+{
+ SetEnvironmentVariable(var, nullptr);
+}
+static std::string rocfft_getenv(const char* var)
+{
+ DWORD size = GetEnvironmentVariable(var, nullptr, 0);
+ std::string ret;
+ if(size)
+ {
+ ret.resize(size);
+ GetEnvironmentVariable(var, ret.data(), size);
+ // GetEnvironmentVariable counts the terminating null, so remove it
+ while(!ret.empty() && ret.back() == 0)
+ ret.pop_back();
+ }
+ return ret;
+}
+
+#else
+
+#include <stdlib.h>
+
+static void rocfft_setenv(const char* var, const char* value)
+{
+ setenv(var, value, 1);
+}
+static void rocfft_unsetenv(const char* var)
+{
+ unsetenv(var);
+}
+static std::string rocfft_getenv(const char* var)
+{
+ auto value = getenv(var);
+ return value ? value : "";
+}
+#endif
+
+// RAII object to set an environment variable and restore it to its
+// previous value on destruction
+struct EnvironmentSetTemp
+{
+ EnvironmentSetTemp(const char* _var, const char* val)
+ : var(_var)
+ {
+ auto val_ptr = rocfft_getenv(_var);
+ if(!val_ptr.empty())
+ oldvalue = val_ptr;
+ rocfft_setenv(_var, val);
+ }
+ ~EnvironmentSetTemp()
+ {
+ if(oldvalue.empty())
+ rocfft_unsetenv(var.c_str());
+ else
+ rocfft_setenv(var.c_str(), oldvalue.c_str());
+ }
+ std::string var;
+ std::string oldvalue;
+};
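+
+// Usage sketch (variable name and value are illustrative):
+//     {
+//         EnvironmentSetTemp tmp("SOME_ENV_VAR", "1");
+//         // code that reads SOME_ENV_VAR sees "1" here
+//     } // on scope exit the previous value (or unset state) is restored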
diff --git a/shared/fft_params.h b/shared/fft_params.h
new file mode 100644
index 0000000..bf428ef
--- /dev/null
+++ b/shared/fft_params.h
@@ -0,0 +1,3274 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef FFT_PARAMS_H
+#define FFT_PARAMS_H
+
+#include <algorithm>
+#include <hip/hip_runtime.h>
+#include <iostream>
+#include <mutex>
+#include <numeric>
+#include <sstream>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <random>
+#include <tuple>
+#include <unordered_set>
+#include <vector>
+
+#include "../shared/arithmetic.h"
+#include "../shared/array_validator.h"
+#include "../shared/data_gen_device.h"
+#include "../shared/data_gen_host.h"
+#include "../shared/device_properties.h"
+#include "../shared/printbuffer.h"
+#include "../shared/ptrdiff.h"
+
+enum fft_status
+{
+ fft_status_success,
+ fft_status_failure,
+ fft_status_invalid_arg_value,
+ fft_status_invalid_dimensions,
+ fft_status_invalid_array_type,
+ fft_status_invalid_strides,
+ fft_status_invalid_distance,
+ fft_status_invalid_offset,
+ fft_status_invalid_work_buffer,
+};
+
+enum fft_transform_type
+{
+ fft_transform_type_complex_forward,
+ fft_transform_type_complex_inverse,
+ fft_transform_type_real_forward,
+ fft_transform_type_real_inverse,
+};
+
+enum fft_precision
+{
+ fft_precision_half,
+ fft_precision_single,
+ fft_precision_double,
+};
+
+static std::istream& operator>>(std::istream& str, fft_precision& precision)
+{
+ std::string word;
+ str >> word;
+
+ if(word == "half")
+ precision = fft_precision_half;
+ else if(word == "single")
+ precision = fft_precision_single;
+ else if(word == "double")
+ precision = fft_precision_double;
+ else
+ throw std::runtime_error("Invalid precision specified");
+ return str;
+}
+
+// fft_input_generator: linearly spaced sequence in [-0.5,0.5]
+// fft_input_random_generator: pseudo-random sequence in [-0.5,0.5]
+enum fft_input_generator
+{
+ fft_input_random_generator_device,
+ fft_input_random_generator_host,
+ fft_input_generator_device,
+ fft_input_generator_host,
+};
+
+static std::istream& operator>>(std::istream& str, fft_input_generator& gen)
+{
+ std::string word;
+ str >> word;
+
+ if(word == "0")
+ gen = fft_input_random_generator_device;
+ else if(word == "1")
+ gen = fft_input_random_generator_host;
+ else if(word == "2")
+ gen = fft_input_generator_device;
+ else if(word == "3")
+ gen = fft_input_generator_host;
+ else
+ throw std::runtime_error("Invalid input generator specified");
+ return str;
+}
+
+enum fft_array_type
+{
+ fft_array_type_complex_interleaved,
+ fft_array_type_complex_planar,
+ fft_array_type_real,
+ fft_array_type_hermitian_interleaved,
+ fft_array_type_hermitian_planar,
+ fft_array_type_unset,
+};
+
+enum fft_result_placement
+{
+ fft_placement_inplace,
+ fft_placement_notinplace,
+};
+
+// Determine the size of the data type given the precision and type.
+template <typename Tsize>
+inline Tsize var_size(const fft_precision precision, const fft_array_type type)
+{
+ size_t var_size = 0;
+ switch(precision)
+ {
+ case fft_precision_half:
+ var_size = sizeof(_Float16);
+ break;
+ case fft_precision_single:
+ var_size = sizeof(float);
+ break;
+ case fft_precision_double:
+ var_size = sizeof(double);
+ break;
+ }
+ switch(type)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ var_size *= 2;
+ break;
+ default:
+ break;
+ }
+ return var_size;
+}
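+
+// Worked example: var_size<size_t>(fft_precision_single, fft_array_type_complex_interleaved)
+// is 2 * sizeof(float) == 8, while var_size<size_t>(fft_precision_double, fft_array_type_real)
+// is sizeof(double) == 8.
+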
+// Given an array type, transform length, strides, etc., fill the input GPU buffers
+// (real or complex, float/double) with generated data: random values or a linearly
+// spaced ramp, depending on the chosen generator.
+template <typename Tfloat, typename Tint1>
+inline void set_input(std::vector<gpubuf>& input,
+ const fft_input_generator igen,
+ const fft_array_type itype,
+ const std::vector<size_t>& length,
+ const std::vector<size_t>& ilength,
+ const std::vector<size_t>& istride,
+ const Tint1& whole_length,
+ const Tint1& whole_stride,
+ const size_t idist,
+ const size_t nbatch,
+ const hipDeviceProp_t& deviceProp)
+{
+ auto isize = count_iters(whole_length) * nbatch;
+
+ switch(itype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ {
+ auto ibuffer = (rocfft_complex<Tfloat>*)input[0].data();
+
+ if(igen == fft_input_generator_device)
+ generate_interleaved_data(
+ whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp);
+ else if(igen == fft_input_random_generator_device)
+ generate_random_interleaved_data(
+ whole_length, idist, isize, whole_stride, ibuffer, deviceProp);
+
+ if(itype == fft_array_type_hermitian_interleaved)
+ {
+ auto ibuffer_2 = (rocfft_complex<Tfloat>*)input[0].data();
+ impose_hermitian_symmetry_interleaved(
+ length, ilength, istride, idist, nbatch, ibuffer_2, deviceProp);
+ }
+
+ break;
+ }
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ {
+ auto ibuffer_real = (Tfloat*)input[0].data();
+ auto ibuffer_imag = (Tfloat*)input[1].data();
+
+ if(igen == fft_input_generator_device)
+ generate_planar_data(whole_length,
+ idist,
+ isize,
+ whole_stride,
+ nbatch,
+ ibuffer_real,
+ ibuffer_imag,
+ deviceProp);
+ else if(igen == fft_input_random_generator_device)
+ generate_random_planar_data(
+ whole_length, idist, isize, whole_stride, ibuffer_real, ibuffer_imag, deviceProp);
+
+ if(itype == fft_array_type_hermitian_planar)
+ impose_hermitian_symmetry_planar(
+ length, ilength, istride, idist, nbatch, ibuffer_real, ibuffer_imag, deviceProp);
+
+ break;
+ }
+ case fft_array_type_real:
+ {
+ auto ibuffer = (Tfloat*)input[0].data();
+
+ if(igen == fft_input_generator_device)
+ generate_real_data(
+ whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp);
+ else if(igen == fft_input_random_generator_device)
+ generate_random_real_data(
+ whole_length, idist, isize, whole_stride, ibuffer, deviceProp);
+
+ break;
+ }
+ default:
+ throw std::runtime_error("Input layout format not yet supported");
+ }
+}
+
+template <typename Tfloat, typename Tint1>
+inline void set_input(std::vector<hostbuf>& input,
+ const fft_input_generator igen,
+ const fft_array_type itype,
+ const std::vector<size_t>& length,
+ const std::vector<size_t>& ilength,
+ const std::vector<size_t>& istride,
+ const Tint1& whole_length,
+ const Tint1& whole_stride,
+ const size_t idist,
+ const size_t nbatch,
+ const hipDeviceProp_t& deviceProp)
+{
+ switch(itype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ {
+ if(igen == fft_input_generator_host)
+ generate_interleaved_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+ else if(igen == fft_input_random_generator_host)
+ generate_random_interleaved_data<Tfloat>(
+ input, whole_length, whole_stride, idist, nbatch);
+
+ if(itype == fft_array_type_hermitian_interleaved)
+ impose_hermitian_symmetry_interleaved<Tfloat>(input, length, istride, idist, nbatch);
+
+ break;
+ }
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ {
+ if(igen == fft_input_generator_host)
+ generate_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+ else if(igen == fft_input_random_generator_host)
+ generate_random_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+
+ if(itype == fft_array_type_hermitian_planar)
+ impose_hermitian_symmetry_planar<Tfloat>(input, length, istride, idist, nbatch);
+
+ break;
+ }
+ case fft_array_type_real:
+ {
+ if(igen == fft_input_generator_host)
+ generate_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+ else if(igen == fft_input_random_generator_host)
+ generate_random_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+
+ break;
+ }
+ default:
+ throw std::runtime_error("Input layout format not yet supported");
+ }
+}
+
+// unroll set_input for dimension 1, 2, 3
+template <typename Tbuff, typename Tfloat>
+inline void set_input(std::vector<Tbuff>& input,
+ const fft_input_generator igen,
+ const fft_array_type itype,
+ const std::vector<size_t>& length,
+ const std::vector<size_t>& ilength,
+ const std::vector<size_t>& istride,
+ const size_t idist,
+ const size_t nbatch,
+ const hipDeviceProp_t& deviceProp)
+{
+ switch(length.size())
+ {
+ case 1:
+ set_input<Tfloat>(input,
+ igen,
+ itype,
+ length,
+ ilength,
+ istride,
+ ilength[0],
+ istride[0],
+ idist,
+ nbatch,
+ deviceProp);
+ break;
+ case 2:
+ set_input<Tfloat>(input,
+ igen,
+ itype,
+ length,
+ ilength,
+ istride,
+ std::make_tuple(ilength[0], ilength[1]),
+ std::make_tuple(istride[0], istride[1]),
+ idist,
+ nbatch,
+ deviceProp);
+ break;
+ case 3:
+ set_input<Tfloat>(input,
+ igen,
+ itype,
+ length,
+ ilength,
+ istride,
+ std::make_tuple(ilength[0], ilength[1], ilength[2]),
+ std::make_tuple(istride[0], istride[1], istride[2]),
+ idist,
+ nbatch,
+ deviceProp);
+ break;
+ default:
+ abort();
+ }
+}
+
+// Container class for test parameters.
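+//
+// Rough usage: fill in the desired lengths and types, let validate() derive
+// the remaining fields, then drive a library-specific subclass (one that
+// overrides create_plan()/execute()), e.g.:
+//
+//   fft_params p;
+//   p.length         = {256, 256};
+//   p.transform_type = fft_transform_type_real_forward;
+//   p.validate();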
+class fft_params
+{
+public:
+ // All parameters are row-major.
+ std::vector<size_t> length;
+ std::vector<size_t> istride;
+ std::vector<size_t> ostride;
+ size_t nbatch = 1;
+ fft_precision precision = fft_precision_single;
+ fft_input_generator igen = fft_input_random_generator_device;
+ fft_transform_type transform_type = fft_transform_type_complex_forward;
+ fft_result_placement placement = fft_placement_inplace;
+ size_t idist = 0;
+ size_t odist = 0;
+ fft_array_type itype = fft_array_type_unset;
+ fft_array_type otype = fft_array_type_unset;
+ std::vector<size_t> ioffset = {0, 0};
+ std::vector<size_t> ooffset = {0, 0};
+
+ std::vector<size_t> isize;
+ std::vector<size_t> osize;
+
+ size_t workbuffersize = 0;
+
+ struct fft_brick
+ {
+ // all vectors here are row-major, with same length as FFT
+ // dimension + 1 (for batch dimension)
+
+ // inclusive lower bound of brick
+ std::vector<size_t> lower;
+ // exclusive upper bound of brick
+ std::vector<size_t> upper;
+ // stride of brick in memory
+ std::vector<size_t> stride;
+
+ // compute the length of this brick
+ std::vector<size_t> length() const
+ {
+ std::vector<size_t> ret;
+ for(size_t i = 0; i < lower.size(); ++i)
+ ret.push_back(upper[i] - lower[i]);
+ return ret;
+ }
+
+ // compute offset of lower bound in a field with the given
+ // stride + dist (batch stride is separate)
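+        // e.g. for lower = {1, 2, 0} (batch, then FFT dims), stride = {16, 1}
+        // and dist = 64, the offset is 1*64 + 2*16 + 0*1 = 96 elements.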
+ size_t lower_field_offset(std::vector<size_t> stride, size_t dist) const
+ {
+ // brick strides include batch, so adjust our input accordingly
+ stride.insert(stride.begin(), dist);
+
+ return std::inner_product(lower.begin(), lower.end(), stride.begin(), 0);
+ }
+
+ // location of the brick
+ int device = 0;
+ };
+
+ struct fft_field
+ {
+ std::vector<fft_brick> bricks;
+ };
+ // optional brick decomposition of inputs/outputs
+ std::vector<fft_field> ifields;
+ std::vector<fft_field> ofields;
+
+ // run testing load/store callbacks
+ bool run_callbacks = false;
+ static constexpr double load_cb_scalar = 0.457813941;
+ static constexpr double store_cb_scalar = 0.391504938;
+
+ // Check that data outside of output strides is not overwritten.
+ // This is only set explicitly on some tests where there's space
+ // between dimensions, but the dimensions are still in-order.
+ // We're not trying to generically find holes in arbitrary data
+ // layouts.
+ //
+ // NOTE: this flag is not included in tokens, since it doesn't
+ // affect how the FFT library behaves.
+ bool check_output_strides = false;
+
+ // scaling factor - we do a pointwise multiplication of outputs by
+ // this factor
+ double scale_factor = 1.0;
+
+ fft_params(){};
+ virtual ~fft_params(){};
+
+ // Given an array type, return the name as a string.
+ static std::string array_type_name(const fft_array_type type, bool verbose = true)
+ {
+ switch(type)
+ {
+ case fft_array_type_complex_interleaved:
+ return verbose ? "fft_array_type_complex_interleaved" : "CI";
+ case fft_array_type_complex_planar:
+ return verbose ? "fft_array_type_complex_planar" : "CP";
+ case fft_array_type_real:
+ return verbose ? "fft_array_type_real" : "R";
+ case fft_array_type_hermitian_interleaved:
+ return verbose ? "fft_array_type_hermitian_interleaved" : "HI";
+ case fft_array_type_hermitian_planar:
+ return verbose ? "fft_array_type_hermitian_planar" : "HP";
+ case fft_array_type_unset:
+ return verbose ? "fft_array_type_unset" : "UN";
+ }
+ return "";
+ }
+
+ std::string transform_type_name() const
+ {
+ switch(transform_type)
+ {
+ case fft_transform_type_complex_forward:
+ return "fft_transform_type_complex_forward";
+ case fft_transform_type_complex_inverse:
+ return "fft_transform_type_complex_inverse";
+ case fft_transform_type_real_forward:
+ return "fft_transform_type_real_forward";
+ case fft_transform_type_real_inverse:
+ return "fft_transform_type_real_inverse";
+ default:
+ throw std::runtime_error("Invalid transform type");
+ }
+ }
+
+ // Convert to string for output.
+ std::string str(const std::string& separator = ", ") const
+ {
+ // top-level stride/dist are not used when fields are specified.
+ const bool have_ifields = !ifields.empty();
+ const bool have_ofields = !ofields.empty();
+
+ std::stringstream ss;
+ auto print_size_vec = [&](const char* description, const std::vector<size_t>& vec) {
+ ss << description << ":";
+ for(auto i : vec)
+ ss << " " << i;
+ ss << separator;
+ };
+ auto print_fields = [&](const char* description, const std::vector<fft_field>& fields) {
+ for(unsigned int fidx = 0; fidx < fields.size(); ++fidx)
+ {
+ const auto& f = fields[fidx];
+ ss << description << " " << fidx << ":" << separator;
+ for(unsigned int bidx = 0; bidx < f.bricks.size(); ++bidx)
+ {
+ const auto& b = f.bricks[bidx];
+ ss << " brick " << bidx << ":" << separator;
+ print_size_vec(" lower", b.lower);
+ print_size_vec(" upper", b.upper);
+ print_size_vec(" stride", b.stride);
+ ss << " device: " << b.device << separator;
+ }
+ }
+ };
+
+ print_size_vec("length", length);
+ if(have_ifields)
+ {
+ print_fields("ifield", ifields);
+ }
+ else
+ {
+ print_size_vec("istride", istride);
+ ss << "idist: " << idist << separator;
+ }
+
+ if(have_ofields)
+ {
+ print_fields("ofield", ofields);
+ }
+ else
+ {
+ print_size_vec("ostride", ostride);
+ ss << "odist: " << odist << separator;
+ }
+
+ ss << "batch: " << nbatch << separator;
+ print_size_vec("isize", isize);
+ print_size_vec("osize", osize);
+
+ print_size_vec("ioffset", ioffset);
+ print_size_vec("ooffset", ooffset);
+
+ if(placement == fft_placement_inplace)
+ ss << "in-place";
+ else
+ ss << "out-of-place";
+ ss << separator;
+ ss << "transform_type: " << transform_type_name() << separator;
+ ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator;
+ switch(precision)
+ {
+ case fft_precision_half:
+ ss << "half-precision";
+ break;
+ case fft_precision_single:
+ ss << "single-precision";
+ break;
+ case fft_precision_double:
+ ss << "double-precision";
+ break;
+ }
+ ss << separator;
+
+ print_size_vec("ilength", ilength());
+ print_size_vec("olength", olength());
+
+ print_size_vec("ibuffer_size", ibuffer_sizes());
+ print_size_vec("obuffer_size", obuffer_sizes());
+
+ if(scale_factor != 1.0)
+ ss << "scale factor: " << scale_factor << separator;
+
+ return ss.str();
+ }
+
+ // Produce a stringified token of the test fft params.
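+    //
+    // For instance, a single-precision in-place length-8 C2C forward FFT with
+    // nbatch = 2, contiguous strides and default (zero) offsets produces one
+    // string of roughly this form (wrapped here for readability):
+    //   complex_forward_len_8_single_ip_batch_2_istride_1_CI_ostride_1_CI
+    //   _idist_8_odist_8_ioffset_0_0_ooffset_0_0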
+ std::string token() const
+ {
+ std::string ret;
+
+ switch(transform_type)
+ {
+ case fft_transform_type_complex_forward:
+ ret += "complex_forward_";
+ break;
+ case fft_transform_type_complex_inverse:
+ ret += "complex_inverse_";
+ break;
+ case fft_transform_type_real_forward:
+ ret += "real_forward_";
+ break;
+ case fft_transform_type_real_inverse:
+ ret += "real_inverse_";
+ break;
+ }
+
+ auto append_size_vec = [&ret](const std::vector<size_t>& vec) {
+ for(auto s : vec)
+ {
+ ret += "_";
+ ret += std::to_string(s);
+ }
+ };
+
+ ret += "len";
+ append_size_vec(length);
+
+ switch(precision)
+ {
+ case fft_precision_half:
+ ret += "_half_";
+ break;
+ case fft_precision_single:
+ ret += "_single_";
+ break;
+ case fft_precision_double:
+ ret += "_double_";
+ break;
+ }
+
+ switch(placement)
+ {
+ case fft_placement_inplace:
+ ret += "ip_";
+ break;
+ case fft_placement_notinplace:
+ ret += "op_";
+ break;
+ }
+
+ ret += "batch_";
+ ret += std::to_string(nbatch);
+
+ auto append_array_type = [&ret](fft_array_type type) {
+ switch(type)
+ {
+ case fft_array_type_complex_interleaved:
+ ret += "CI";
+ break;
+ case fft_array_type_complex_planar:
+ ret += "CP";
+ break;
+ case fft_array_type_real:
+ ret += "R";
+ break;
+ case fft_array_type_hermitian_interleaved:
+ ret += "HI";
+ break;
+ case fft_array_type_hermitian_planar:
+ ret += "HP";
+ break;
+ default:
+ ret += "UN";
+ break;
+ }
+ };
+
+ auto append_brick_info = [&ret, &append_size_vec](const fft_brick& b) {
+ ret += "_brick";
+
+ ret += "_lower";
+ append_size_vec(b.lower);
+ ret += "_upper";
+ append_size_vec(b.upper);
+ ret += "_stride";
+ append_size_vec(b.stride);
+ ret += "_dev_";
+ ret += std::to_string(b.device);
+ };
+
+ const bool have_ifields = !ifields.empty();
+ const bool have_ofields = !ofields.empty();
+
+ if(have_ifields)
+ {
+ for(const auto& f : ifields)
+ {
+ ret += "_ifield";
+ for(const auto& b : f.bricks)
+ append_brick_info(b);
+ }
+ }
+ else
+ {
+ ret += "_istride";
+ append_size_vec(istride);
+ ret += "_";
+ append_array_type(itype);
+ }
+
+ if(have_ofields)
+ {
+ for(const auto& f : ofields)
+ {
+ ret += "_ofield";
+ for(const auto& b : f.bricks)
+ append_brick_info(b);
+ }
+ }
+ else
+ {
+ ret += "_ostride";
+ append_size_vec(ostride);
+ ret += "_";
+ append_array_type(otype);
+ }
+
+ if(!have_ifields)
+ {
+ ret += "_idist_";
+ ret += std::to_string(idist);
+ }
+ if(!have_ofields)
+ {
+ ret += "_odist_";
+ ret += std::to_string(odist);
+ }
+
+ if(!have_ifields)
+ {
+ ret += "_ioffset";
+ append_size_vec(ioffset);
+ }
+
+ if(!have_ofields)
+ {
+ ret += "_ooffset";
+ append_size_vec(ooffset);
+ }
+
+ if(run_callbacks)
+ ret += "_CB";
+
+ if(scale_factor != 1.0)
+ ret += "_scale";
+
+ return ret;
+ }
+
+ // Set all params from a stringified token.
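+    // Only fields encoded by token() are restored; anything else (e.g.
+    // check_output_strides) keeps its default, and a "scale" suffix only
+    // restores an arbitrary non-unit scale factor rather than the original
+    // value.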
+ void from_token(std::string token)
+ {
+ std::vector<std::string> vals;
+
+ std::string delimiter = "_";
+ {
+ size_t pos = 0;
+ while((pos = token.find(delimiter)) != std::string::npos)
+ {
+ auto val = token.substr(0, pos);
+ vals.push_back(val);
+ token.erase(0, pos + delimiter.length());
+ }
+ vals.push_back(token);
+ }
+
+ auto size_parser
+ = [](const std::vector<std::string>& vals, const std::string token, size_t& pos) {
+ if(vals[pos++] != token)
+ throw std::runtime_error("Unable to parse token");
+ return std::stoull(vals[pos++]);
+ };
+
+ auto vector_parser
+ = [](const std::vector<std::string>& vals, const std::string token, size_t& pos) {
+ if(vals[pos++] != token)
+ throw std::runtime_error("Unable to parse token");
+ std::vector<size_t> vec;
+
+ while(pos < vals.size())
+ {
+ if(std::all_of(vals[pos].begin(), vals[pos].end(), ::isdigit))
+ {
+ vec.push_back(std::stoull(vals[pos++]));
+ }
+ else
+ {
+ break;
+ }
+ }
+ return vec;
+ };
+
+ auto type_parser = [](const std::string& val) {
+ if(val == "CI")
+ return fft_array_type_complex_interleaved;
+ else if(val == "CP")
+ return fft_array_type_complex_planar;
+ else if(val == "R")
+ return fft_array_type_real;
+ else if(val == "HI")
+ return fft_array_type_hermitian_interleaved;
+ else if(val == "HP")
+ return fft_array_type_hermitian_planar;
+ return fft_array_type_unset;
+ };
+
+ auto field_parser = [&vector_parser, &size_parser](const std::vector<std::string>& vals,
+ size_t& pos,
+ std::vector<fft_field>& output) {
+ // skip over ifield/ofield word
+ pos++;
+ fft_field& f = output.emplace_back();
+ while(pos < vals.size() && vals[pos] == "brick")
+ {
+ fft_brick& b = f.bricks.emplace_back();
+ pos++;
+ b.lower = vector_parser(vals, "lower", pos);
+ b.upper = vector_parser(vals, "upper", pos);
+ b.stride = vector_parser(vals, "stride", pos);
+ b.device = size_parser(vals, "dev", pos);
+ }
+ };
+
+ size_t pos = 0;
+
+ bool complex = vals[pos++] == "complex";
+ bool forward = vals[pos++] == "forward";
+
+ if(complex && forward)
+ transform_type = fft_transform_type_complex_forward;
+ if(complex && !forward)
+ transform_type = fft_transform_type_complex_inverse;
+ if(!complex && forward)
+ transform_type = fft_transform_type_real_forward;
+ if(!complex && !forward)
+ transform_type = fft_transform_type_real_inverse;
+
+ length = vector_parser(vals, "len", pos);
+
+ if(vals[pos] == "half")
+ precision = fft_precision_half;
+ else if(vals[pos] == "single")
+ precision = fft_precision_single;
+ else if(vals[pos] == "double")
+ precision = fft_precision_double;
+ pos++;
+
+ placement = (vals[pos++] == "ip") ? fft_placement_inplace : fft_placement_notinplace;
+
+ nbatch = size_parser(vals, "batch", pos);
+
+ // strides, bricks etc are mixed in from here, so just keep
+ // looking at the next token to decide what to do
+ while(pos < vals.size())
+ {
+ const auto& next_token = vals[pos];
+ if(next_token == "istride")
+ {
+ istride = vector_parser(vals, "istride", pos);
+ itype = type_parser(vals[pos]);
+ pos++;
+ }
+ else if(next_token == "ostride")
+ {
+ ostride = vector_parser(vals, "ostride", pos);
+ otype = type_parser(vals[pos]);
+ pos++;
+ }
+ else if(next_token == "idist")
+ idist = size_parser(vals, "idist", pos);
+ else if(next_token == "odist")
+ odist = size_parser(vals, "odist", pos);
+ else if(next_token == "ioffset")
+ ioffset = vector_parser(vals, "ioffset", pos);
+ else if(next_token == "ooffset")
+ ooffset = vector_parser(vals, "ooffset", pos);
+ else if(next_token == "ifield")
+ field_parser(vals, pos, ifields);
+ else if(next_token == "ofield")
+ field_parser(vals, pos, ofields);
+ else
+ break;
+ }
+
+ if(pos < vals.size() && vals[pos] == "CB")
+ {
+ run_callbacks = true;
+ ++pos;
+ }
+
+ if(pos < vals.size() && vals[pos] == "scale")
+ {
+ // just pick some factor that's not zero or one
+ scale_factor = 0.1239;
+ ++pos;
+ }
+ }
+
+ // Stream output operator (for gtest, etc).
+ friend std::ostream& operator<<(std::ostream& stream, const fft_params& params)
+ {
+ stream << params.str();
+ return stream;
+ }
+
+ // Dimension of the transform.
+ size_t dim() const
+ {
+ return length.size();
+ }
+
+ virtual std::vector<size_t> ilength() const
+ {
+ auto ilength = length;
+ if(transform_type == fft_transform_type_real_inverse)
+ ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1;
+ return ilength;
+ }
+
+ virtual std::vector<size_t> olength() const
+ {
+ auto olength = length;
+ if(transform_type == fft_transform_type_real_forward)
+ olength[dim() - 1] = olength[dim() - 1] / 2 + 1;
+ return olength;
+ }
+
+ static size_t nbuffer(const fft_array_type type)
+ {
+ switch(type)
+ {
+ case fft_array_type_real:
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ return 1;
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ return 2;
+ case fft_array_type_unset:
+ return 0;
+ }
+ return 0;
+ }
+
+ // Number of input buffers
+ size_t nibuffer() const
+ {
+ return nbuffer(itype);
+ }
+
+ // Number of output buffers
+ size_t nobuffer() const
+ {
+ return nbuffer(otype);
+ }
+
+ void set_iotypes()
+ {
+ if(itype == fft_array_type_unset)
+ {
+ switch(transform_type)
+ {
+ case fft_transform_type_complex_forward:
+ case fft_transform_type_complex_inverse:
+ itype = fft_array_type_complex_interleaved;
+ break;
+ case fft_transform_type_real_forward:
+ itype = fft_array_type_real;
+ break;
+ case fft_transform_type_real_inverse:
+ itype = fft_array_type_hermitian_interleaved;
+ break;
+ default:
+ throw std::runtime_error("Invalid transform type");
+ }
+ }
+ if(otype == fft_array_type_unset)
+ {
+ switch(transform_type)
+ {
+ case fft_transform_type_complex_forward:
+ case fft_transform_type_complex_inverse:
+ otype = fft_array_type_complex_interleaved;
+ break;
+ case fft_transform_type_real_forward:
+ otype = fft_array_type_hermitian_interleaved;
+ break;
+ case fft_transform_type_real_inverse:
+ otype = fft_array_type_real;
+ break;
+ default:
+ throw std::runtime_error("Invalid transform type");
+ }
+ }
+ }
+
+ // Check that the input and output types are consistent.
+ bool check_iotypes() const
+ {
+ switch(itype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_interleaved:
+ case fft_array_type_hermitian_planar:
+ case fft_array_type_real:
+ break;
+ default:
+ throw std::runtime_error("Invalid Input array type format");
+ }
+
+ switch(otype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_interleaved:
+ case fft_array_type_hermitian_planar:
+ case fft_array_type_real:
+ break;
+ default:
+        throw std::runtime_error("Invalid output array type format");
+ }
+
+ // Check that format choices are supported
+ if(transform_type != fft_transform_type_real_forward
+ && transform_type != fft_transform_type_real_inverse)
+ {
+ if(placement == fft_placement_inplace && itype != otype)
+ {
+ throw std::runtime_error(
+ "In-place transforms must have identical input and output types");
+ }
+ }
+
+ bool okformat = true;
+ switch(itype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_complex_planar:
+ okformat = (otype == fft_array_type_complex_interleaved
+ || otype == fft_array_type_complex_planar);
+ break;
+ case fft_array_type_hermitian_interleaved:
+ case fft_array_type_hermitian_planar:
+ okformat = otype == fft_array_type_real;
+ break;
+ case fft_array_type_real:
+ okformat = (otype == fft_array_type_hermitian_interleaved
+ || otype == fft_array_type_hermitian_planar);
+ break;
+ default:
+ throw std::runtime_error("Invalid Input array type format");
+ }
+
+ return okformat;
+ }
+
+ // Given a length vector, set the rest of the strides.
+ // The optional argument stride0 sets the stride for the contiguous dimension.
+ // The optional rcpadding argument sets the stride correctly for in-place
+ // multi-dimensional real/complex transforms.
+ // Format is row-major.
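+    //
+    // e.g. compute_stride({4, 8, 16}) gives {128, 16, 1}; with rcpadding the
+    // fastest length is padded to 2 * (16 / 2 + 1) = 18 when computing the
+    // slower strides, giving {144, 18, 1}.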
+ template <typename T1>
+ std::vector<T1> compute_stride(const std::vector<T1>& length,
+ const std::vector<size_t>& stride0 = std::vector<size_t>(),
+ const bool rcpadding = false) const
+ {
+ std::vector<T1> stride(dim());
+
+ size_t dimoffset = 0;
+
+ if(stride0.size() == 0)
+ {
+ // Set the contiguous stride:
+ stride[dim() - 1] = 1;
+ dimoffset = 1;
+ }
+ else
+ {
+ // Copy the input values to the end of the stride array:
+ for(size_t i = 0; i < stride0.size(); ++i)
+ {
+ stride[dim() - stride0.size() + i] = stride0[i];
+ }
+ }
+
+ if(stride0.size() < dim())
+ {
+ // Compute any remaining values via recursion.
+ for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;)
+ {
+ auto lengthip1 = length[i + 1];
+ if(rcpadding && i == dim() - 2)
+ {
+ lengthip1 = 2 * (lengthip1 / 2 + 1);
+ }
+ stride[i] = stride[i + 1] * lengthip1;
+ }
+ }
+
+ return stride;
+ }
+
+ void compute_istride()
+ {
+ istride = compute_stride(ilength(),
+ istride,
+ placement == fft_placement_inplace
+ && transform_type == fft_transform_type_real_forward);
+ }
+
+ void compute_ostride()
+ {
+ ostride = compute_stride(olength(),
+ ostride,
+ placement == fft_placement_inplace
+ && transform_type == fft_transform_type_real_inverse);
+ }
+
+ virtual void compute_isize()
+ {
+ auto il = ilength();
+ size_t val = compute_ptrdiff(il, istride, nbatch, idist);
+ isize.resize(nibuffer());
+ for(unsigned int i = 0; i < isize.size(); ++i)
+ {
+ isize[i] = val + ioffset[i];
+ }
+ }
+
+ virtual void compute_osize()
+ {
+ auto ol = olength();
+ size_t val = compute_ptrdiff(ol, ostride, nbatch, odist);
+ osize.resize(nobuffer());
+ for(unsigned int i = 0; i < osize.size(); ++i)
+ {
+ osize[i] = val + ooffset[i];
+ }
+ }
+
+ std::vector<size_t> ibuffer_sizes() const
+ {
+ std::vector<size_t> ibuffer_sizes;
+
+ // In-place real-to-complex transforms need to have enough space in the input buffer to
+    // accommodate the output, which is slightly larger.
+ if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward)
+ {
+ return obuffer_sizes();
+ }
+
+ if(isize.empty())
+ return ibuffer_sizes;
+
+ switch(itype)
+ {
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ ibuffer_sizes.resize(2);
+ break;
+ default:
+ ibuffer_sizes.resize(1);
+ }
+ for(unsigned i = 0; i < ibuffer_sizes.size(); i++)
+ {
+ ibuffer_sizes[i] = isize[i] * var_size<size_t>(precision, itype);
+ }
+ return ibuffer_sizes;
+ }
+
+ virtual std::vector<size_t> obuffer_sizes() const
+ {
+ std::vector<size_t> obuffer_sizes;
+
+ if(osize.empty())
+ return obuffer_sizes;
+
+ switch(otype)
+ {
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ obuffer_sizes.resize(2);
+ break;
+ default:
+ obuffer_sizes.resize(1);
+ }
+ for(unsigned i = 0; i < obuffer_sizes.size(); i++)
+ {
+ obuffer_sizes[i] = osize[i] * var_size<size_t>(precision, otype);
+ }
+ return obuffer_sizes;
+ }
+
+ // Compute the idist for a given transform based on the placeness, transform type, and data
+ // layout.
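+    // e.g. an in-place 1D real-forward transform of length 8 with unit stride
+    // gets idist = 2 * (8 / 2 + 1) = 10 real elements, leaving room for the
+    // 5 complex outputs written in place.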
+ size_t compute_idist() const
+ {
+ size_t dist = 0;
+ // In-place 1D transforms need extra dist.
+ if(transform_type == fft_transform_type_real_forward && dim() == 1
+ && placement == fft_placement_inplace)
+ {
+ dist = 2 * (length[0] / 2 + 1) * istride[0];
+ return dist;
+ }
+
+ if(transform_type == fft_transform_type_real_inverse && dim() == 1)
+ {
+ dist = (length[0] / 2 + 1) * istride[0];
+ return dist;
+ }
+
+ dist = (transform_type == fft_transform_type_real_inverse)
+ ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1]
+ : length[dim() - 1] * istride[dim() - 1];
+ for(unsigned int i = 0; i < dim() - 1; ++i)
+ {
+ dist = std::max(length[i] * istride[i], dist);
+ }
+ return dist;
+ }
+ void set_idist()
+ {
+ if(idist != 0)
+ return;
+ idist = compute_idist();
+ }
+
+ // Compute the odist for a given transform based on the placeness, transform type, and data
+ // layout. Row-major.
+ size_t compute_odist() const
+ {
+ size_t dist = 0;
+ // In-place 1D transforms need extra dist.
+ if(transform_type == fft_transform_type_real_inverse && dim() == 1
+ && placement == fft_placement_inplace)
+ {
+ dist = 2 * (length[0] / 2 + 1) * ostride[0];
+ return dist;
+ }
+
+ if(transform_type == fft_transform_type_real_forward && dim() == 1)
+ {
+ dist = (length[0] / 2 + 1) * ostride[0];
+ return dist;
+ }
+
+ dist = (transform_type == fft_transform_type_real_forward)
+ ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1]
+ : length[dim() - 1] * ostride[dim() - 1];
+ for(unsigned int i = 0; i < dim() - 1; ++i)
+ {
+ dist = std::max(length[i] * ostride[i], dist);
+ }
+ return dist;
+ }
+ void set_odist()
+ {
+ if(odist != 0)
+ return;
+ odist = compute_odist();
+ }
+
+ // Put the length, stride, batch, and dist into a single length/stride array and pass off to the
+ // validity checker.
+ bool valid_length_stride_batch_dist(const std::vector<size_t>& l0,
+ const std::vector<size_t>& s0,
+ const size_t n,
+ const size_t dist,
+ const int verbose = 0) const
+ {
+ if(l0.size() != s0.size())
+ return false;
+
+        // Length and stride vectors, including batches:
+ std::vector<size_t> l{}, s{};
+ for(unsigned int i = 0; i < l0.size(); ++i)
+ {
+ if(l0[i] > 1)
+ {
+ if(s0[i] == 0)
+ return false;
+ l.push_back(l0[i]);
+ s.push_back(s0[i]);
+ }
+ }
+ if(n > 1)
+ {
+ if(dist == 0)
+ return false;
+ l.push_back(n);
+ s.push_back(dist);
+ }
+
+ return array_valid(l, s, verbose);
+ }
+
+ // Return true if the given GPU parameters would produce a valid transform.
+ bool valid(const int verbose) const
+ {
+ if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer())
+ return false;
+
+ // Check that in-place transforms have the same input and output stride:
+ if(placement == fft_placement_inplace)
+ {
+ const auto stridesize = std::min(istride.size(), ostride.size());
+ bool samestride = true;
+ for(unsigned int i = 0; i < stridesize; ++i)
+ {
+ if(istride[i] != ostride[i])
+ samestride = false;
+ }
+ if((transform_type == fft_transform_type_complex_forward
+ || transform_type == fft_transform_type_complex_inverse)
+ && !samestride)
+ {
+ // In-place transforms require identical input and output strides.
+ if(verbose)
+ {
+ std::cout << "istride:";
+ for(const auto& i : istride)
+ std::cout << " " << i;
+ std::cout << " ostride0:";
+ for(const auto& i : ostride)
+ std::cout << " " << i;
+ std::cout << " differ; skipped for in-place transforms: skipping test"
+ << std::endl;
+ }
+ return false;
+ }
+
+ if((transform_type == fft_transform_type_complex_forward
+ || transform_type == fft_transform_type_complex_inverse)
+ && (idist != odist) && nbatch > 1)
+ {
+ // In-place transforms require identical distance, if
+ // batch > 1. If batch is 1 then dist is ignored and
+ // the FFT should still work.
+ if(verbose)
+ {
+ std::cout << "idist:" << idist << " odist:" << odist
+ << " differ; skipped for in-place transforms: skipping test"
+ << std::endl;
+ }
+ return false;
+ }
+
+ if((transform_type == fft_transform_type_real_forward
+ || transform_type == fft_transform_type_real_inverse)
+ && (istride.back() != 1 || ostride.back() != 1))
+ {
+ // In-place real/complex transforms require unit strides.
+ if(verbose)
+ {
+ std::cout
+ << "istride.back(): " << istride.back()
+ << " ostride.back(): " << ostride.back()
+ << " must be unitary for in-place real/complex transforms: skipping test"
+ << std::endl;
+ }
+ return false;
+ }
+
+ if((itype == fft_array_type_complex_interleaved
+ && otype == fft_array_type_complex_planar)
+ || (itype == fft_array_type_complex_planar
+ && otype == fft_array_type_complex_interleaved))
+ {
+ if(verbose)
+ {
+ std::cout << "In-place c2c transforms require identical io types; skipped.\n";
+ }
+ return false;
+ }
+
+ // Check offsets
+ switch(transform_type)
+ {
+ case fft_transform_type_complex_forward:
+ case fft_transform_type_complex_inverse:
+ for(unsigned int i = 0; i < nibuffer(); ++i)
+ {
+ if(ioffset[i] != ooffset[i])
+ return false;
+ }
+ break;
+ case fft_transform_type_real_forward:
+ if(ioffset[0] != 2 * ooffset[0])
+ return false;
+ break;
+ case fft_transform_type_real_inverse:
+ if(2 * ioffset[0] != ooffset[0])
+ return false;
+ break;
+ }
+ }
+
+ if(!check_iotypes())
+ return false;
+
+ // we can only check output strides on out-of-place
+ // transforms, since we need to initialize output to a known
+ // pattern
+ if(placement == fft_placement_inplace && check_output_strides)
+ return false;
+
+ // Check input and output strides
+ if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true)
+ {
+ if(verbose)
+ std::cout << "Invalid input data format.\n";
+ return false;
+ }
+ if(!(ilength() == olength() && istride == ostride && idist == odist))
+ {
+ // Only check if different
+ if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true)
+ {
+ if(verbose)
+ std::cout << "Invalid output data format.\n";
+ return false;
+ }
+ }
+
+ // The parameters are valid.
+ return true;
+ }
+
+ // Fill in any missing parameters.
+ void validate()
+ {
+ set_iotypes();
+ compute_istride();
+ compute_ostride();
+ set_idist();
+ set_odist();
+ compute_isize();
+ compute_osize();
+
+ validate_fields();
+ }
+
+ virtual void validate_fields() const
+ {
+ if(!ifields.empty() || !ofields.empty())
+ throw std::runtime_error("input/output fields are unsupported");
+ }
+
+ // Column-major getters:
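+    // (these simply reverse the row-major vectors, e.g. a row-major length of
+    // {4, 8, 16} is returned as {16, 8, 4} by length_cm())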
+ std::vector<size_t> length_cm() const
+ {
+ auto length_cm = length;
+ std::reverse(std::begin(length_cm), std::end(length_cm));
+ return length_cm;
+ }
+ std::vector<size_t> ilength_cm() const
+ {
+ auto ilength_cm = ilength();
+ std::reverse(std::begin(ilength_cm), std::end(ilength_cm));
+ return ilength_cm;
+ }
+ std::vector<size_t> olength_cm() const
+ {
+ auto olength_cm = olength();
+ std::reverse(std::begin(olength_cm), std::end(olength_cm));
+ return olength_cm;
+ }
+ std::vector<size_t> istride_cm() const
+ {
+ auto istride_cm = istride;
+ std::reverse(std::begin(istride_cm), std::end(istride_cm));
+ return istride_cm;
+ }
+ std::vector<size_t> ostride_cm() const
+ {
+ auto ostride_cm = ostride;
+ std::reverse(std::begin(ostride_cm), std::end(ostride_cm));
+ return ostride_cm;
+ }
+ bool is_planar() const
+ {
+ if(itype == fft_array_type_complex_planar || itype == fft_array_type_hermitian_planar)
+ return true;
+ if(otype == fft_array_type_complex_planar || otype == fft_array_type_hermitian_planar)
+ return true;
+ return false;
+ }
+
+ // Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if necessary.
+ template <typename Tbuff>
+ inline void compute_input(std::vector<Tbuff>& input)
+ {
+ auto deviceProp = get_curr_device_prop();
+
+ switch(precision)
+ {
+ case fft_precision_half:
+ set_input<Tbuff, _Float16>(
+ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
+ break;
+ case fft_precision_double:
+ set_input<Tbuff, double>(
+ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
+ break;
+ case fft_precision_single:
+ set_input<Tbuff, float>(
+ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
+ break;
+ }
+ }
+
+ template <typename Tstream = std::ostream>
+ void print_ibuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const
+ {
+ switch(itype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ {
+ buffer_printer<rocfft_complex<_Float16>> s;
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+ break;
+ }
+ case fft_precision_single:
+ {
+ buffer_printer<rocfft_complex<float>> s;
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+ break;
+ }
+ case fft_precision_double:
+ {
+ buffer_printer<rocfft_complex<double>> s;
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+ break;
+ }
+ }
+ break;
+ }
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ case fft_array_type_real:
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ {
+ buffer_printer<_Float16> s;
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+ break;
+ }
+ case fft_precision_single:
+ {
+ buffer_printer<float> s;
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+ break;
+ }
+ case fft_precision_double:
+ {
+ buffer_printer<double> s;
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+ break;
+ }
+ }
+ break;
+ }
+ default:
+ throw std::runtime_error("Invalid itype in print_ibuffer");
+ }
+ }
+
+ template <typename Tstream = std::ostream>
+ void print_obuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const
+ {
+ switch(otype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ {
+ buffer_printer<rocfft_complex<_Float16>> s;
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+ break;
+ }
+ case fft_precision_single:
+ {
+ buffer_printer<rocfft_complex<float>> s;
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+ break;
+ }
+            case fft_precision_double:
+            {
+                buffer_printer<rocfft_complex<double>> s;
+                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+                break;
+            }
+ }
+ break;
+ }
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ case fft_array_type_real:
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ {
+ buffer_printer<_Float16> s;
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+ break;
+ }
+ case fft_precision_single:
+ {
+ buffer_printer<float> s;
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+ break;
+ }
+ case fft_precision_double:
+ {
+ buffer_printer<double> s;
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+ break;
+ }
+ }
+ break;
+ }
+
+ default:
+        throw std::runtime_error("Invalid otype in print_obuffer");
+ }
+ }
+
+ void print_ibuffer_flat(const std::vector<hostbuf>& buf) const
+ {
+ switch(itype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ {
+ buffer_printer<rocfft_complex<_Float16>> s;
+                s.print_buffer_flat(buf, isize, ioffset);
+ break;
+ }
+ case fft_precision_single:
+ {
+ buffer_printer<rocfft_complex<float>> s;
+                s.print_buffer_flat(buf, isize, ioffset);
+ break;
+ }
+            case fft_precision_double:
+            {
+                buffer_printer<rocfft_complex<double>> s;
+                s.print_buffer_flat(buf, isize, ioffset);
+                break;
+            }
+ }
+ break;
+ }
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ case fft_array_type_real:
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ {
+ buffer_printer<_Float16> s;
+            s.print_buffer_flat(buf, isize, ioffset);
+ break;
+ }
+ case fft_precision_single:
+ {
+ buffer_printer<float> s;
+            s.print_buffer_flat(buf, isize, ioffset);
+ break;
+ }
+ case fft_precision_double:
+ {
+ buffer_printer<double> s;
+            s.print_buffer_flat(buf, isize, ioffset);
+ break;
+ }
+ }
+        break;
+    }
+    default:
+        throw std::runtime_error("Invalid itype in print_ibuffer_flat");
+    }
+    }
+
+ void print_obuffer_flat(const std::vector<hostbuf>& buf) const
+ {
+ switch(otype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ {
+ buffer_printer<rocfft_complex<_Float16>> s;
+ s.print_buffer_flat(buf, osize, ooffset);
+ break;
+ }
+ case fft_precision_single:
+ {
+ buffer_printer<rocfft_complex<float>> s;
+ s.print_buffer_flat(buf, osize, ooffset);
+ break;
+ }
+            case fft_precision_double:
+            {
+                buffer_printer<rocfft_complex<double>> s;
+                s.print_buffer_flat(buf, osize, ooffset);
+                break;
+            }
+ }
+ break;
+ }
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ case fft_array_type_real:
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ {
+ buffer_printer<_Float16> s;
+ s.print_buffer_flat(buf, osize, ooffset);
+ break;
+ }
+ case fft_precision_single:
+ {
+ buffer_printer<float> s;
+ s.print_buffer_flat(buf, osize, ooffset);
+ break;
+ }
+
+ case fft_precision_double:
+ {
+ buffer_printer<double> s;
+ s.print_buffer_flat(buf, osize, ooffset);
+ break;
+ }
+ }
+        break;
+    }
+    default:
+        throw std::runtime_error("Invalid otype in print_obuffer_flat");
+    }
+    }
+
+ virtual fft_status set_callbacks(void* load_cb_host,
+ void* load_cb_data,
+ void* store_cb_host,
+ void* store_cb_data)
+ {
+ return fft_status_success;
+ }
+
+ virtual fft_status execute(void** in, void** out)
+ {
+ return fft_status_success;
+ };
+
+ size_t fft_params_vram_footprint()
+ {
+ return fft_params::vram_footprint();
+ }
+
+ virtual size_t vram_footprint()
+ {
+ const auto ibuf_size = ibuffer_sizes();
+ size_t val = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1);
+ if(placement == fft_placement_notinplace)
+ {
+ const auto obuf_size = obuffer_sizes();
+ val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1);
+ }
+ return val;
+ }
+
+ // Specific exception type for work buffer allocation failure.
+ // Tests that hit this can't fit on the GPU and should be skipped.
+ struct work_buffer_alloc_failure : public std::runtime_error
+ {
+ work_buffer_alloc_failure(const std::string& s)
+ : std::runtime_error(s)
+ {
+ }
+ };
+
+ virtual fft_status create_plan()
+ {
+ return fft_status_success;
+ }
+
+    // Change a forward transform to its inverse
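+    // (strides, dists, types and offsets are swapped between input and output,
+    // and the scale factor is inverted, e.g. a forward scale of 4.0 becomes
+    // 0.25 here)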
+ void inverse_from_forward(fft_params& params_forward)
+ {
+ switch(params_forward.transform_type)
+ {
+ case fft_transform_type_complex_forward:
+ transform_type = fft_transform_type_complex_inverse;
+ break;
+ case fft_transform_type_real_forward:
+ transform_type = fft_transform_type_real_inverse;
+ break;
+ default:
+ throw std::runtime_error("Transform type not forward.");
+ }
+
+ length = params_forward.length;
+ istride = params_forward.ostride;
+ ostride = params_forward.istride;
+ nbatch = params_forward.nbatch;
+ precision = params_forward.precision;
+ placement = params_forward.placement;
+ idist = params_forward.odist;
+ odist = params_forward.idist;
+ itype = params_forward.otype;
+ otype = params_forward.itype;
+ ioffset = params_forward.ooffset;
+ ooffset = params_forward.ioffset;
+
+ run_callbacks = params_forward.run_callbacks;
+
+ check_output_strides = params_forward.check_output_strides;
+
+ scale_factor = 1 / params_forward.scale_factor;
+ }
+
+ // prepare for multi-GPU transform. Generated input is in ibuffer.
+ // pibuffer, pobuffer are the pointers that will be passed to the
+ // FFT library's "execute" API.
+ virtual void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
+ std::vector<void*>& pibuffer,
+ std::vector<void*>& pobuffer)
+ {
+ }
+
+ // finalize multi-GPU transform. pobuffers are the pointers
+ // provided to the FFT library's "execute" API. obuffer is the
+ // buffer where transform output needs to go for validation
+ virtual void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) {}
+
+ // create bricks in the specified field for the specified number
+ // of devices. The field is split along the highest FFT
+ // dimension, and the length only includes FFT lengths, not batch
+ // dimension.
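+    // e.g. distributing a slowest length of 8 across 3 devices yields bricks
+    // covering rows [0, 2), [2, 4) and [4, 8); the last brick absorbs any
+    // remainder.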
+ void distribute_field(int deviceCount,
+ std::vector<fft_field>& fields,
+ const std::vector<size_t>& field_length)
+ {
+ size_t slowLen = field_length.front();
+ if(slowLen < static_cast<size_t>(deviceCount))
+ throw std::runtime_error("too many devices to distribute length "
+ + std::to_string(slowLen));
+
+ auto& field = fields.emplace_back();
+
+ for(int i = 0; i < deviceCount; ++i)
+ {
+ // start at origin
+ std::vector<size_t> field_lower(field_length.size());
+ std::vector<size_t> field_upper(field_length.size());
+
+ // note: slowest FFT dim is index 0 in these coordinates
+ field_lower[0] = slowLen / deviceCount * i;
+
+ // last brick needs to include the whole slow len
+ if(i == deviceCount - 1)
+ {
+ field_upper[0] = slowLen;
+ }
+ else
+ {
+ field_upper[0] = std::min(slowLen, field_lower[0] + slowLen / deviceCount);
+ }
+
+ for(unsigned int upperDim = 1; upperDim < field_length.size(); ++upperDim)
+ {
+ field_upper[upperDim] = field_length[upperDim];
+ }
+
+ // field coordinates also need to include batch
+ field_lower.insert(field_lower.begin(), 0);
+ field_upper.insert(field_upper.begin(), nbatch);
+
+ // bricks have contiguous strides
+ size_t brick_dist = 1;
+ std::vector<size_t> brick_stride(field_lower.size());
+ for(size_t distIdx = 0; distIdx < field_lower.size(); ++distIdx)
+ {
+ // fill strides from fastest to slowest
+ *(brick_stride.rbegin() + distIdx) = brick_dist;
+ brick_dist *= *(field_upper.rbegin() + distIdx) - *(field_lower.rbegin() + distIdx);
+ }
+ field.bricks.push_back(
+ fft_params::fft_brick{field_lower, field_upper, brick_stride, i});
+ }
+ }
+
+ void distribute_input(int deviceCount)
+ {
+ distribute_field(deviceCount, ifields, length);
+ }
+
+ void distribute_output(int deviceCount)
+ {
+ distribute_field(deviceCount, ofields, olength());
+ }
+};
+
+// This is used with the program_options class so that the user can type an integer on the
+// command line and we store it into an enum variable
+template <typename _Elem, typename _Traits>
+std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
+ fft_array_type& atype)
+{
+ unsigned tmp;
+ stream >> tmp;
+ atype = fft_array_type(tmp);
+ return stream;
+}
+
+// similarly for transform type
+template <typename _Elem, typename _Traits>
+std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
+ fft_transform_type& ttype)
+{
+ unsigned tmp;
+ stream >> tmp;
+ ttype = fft_transform_type(tmp);
+ return stream;
+}
+
+// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
+template <typename T1>
+std::vector<std::pair<T1, T1>> partition_colmajor(const T1& length)
+{
+ return partition_base(length, compute_partition_count(length));
+}
+
+// Partition on the rightmost part of the tuple, for col-major indexing
+template <typename T1>
+std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
+ partition_colmajor(const std::tuple<T1, T1>& length)
+{
+ auto partitions = partition_base(std::get<1>(length), compute_partition_count(length));
+ std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
+ for(size_t i = 0; i < partitions.size(); ++i)
+ {
+ std::get<1>(ret[i].first) = partitions[i].first;
+ std::get<0>(ret[i].first) = 0;
+ std::get<1>(ret[i].second) = partitions[i].second;
+ std::get<0>(ret[i].second) = std::get<0>(length);
+ }
+ return ret;
+}
+template <typename T1>
+std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
+ partition_colmajor(const std::tuple<T1, T1, T1>& length)
+{
+ auto partitions = partition_base(std::get<2>(length), compute_partition_count(length));
+ std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
+ for(size_t i = 0; i < partitions.size(); ++i)
+ {
+ std::get<2>(ret[i].first) = partitions[i].first;
+ std::get<1>(ret[i].first) = 0;
+ std::get<0>(ret[i].first) = 0;
+ std::get<2>(ret[i].second) = partitions[i].second;
+ std::get<1>(ret[i].second) = std::get<1>(length);
+ std::get<0>(ret[i].second) = std::get<0>(length);
+ }
+ return ret;
+}
+
+// Copy data of dimensions length with strides istride and length idist between batches to
+// a buffer with strides ostride and length odist between batches. The input and output
+// types are identical.
+template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers_1to1(const Tval* input,
+ Tval* output,
+ const Tint1& whole_length,
+ const size_t nbatch,
+ const Tint2& istride,
+ const size_t idist,
+ const Tint3& ostride,
+ const size_t odist,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset)
+{
+ const bool idx_equals_odx = istride == ostride && idist == odist;
+ size_t idx_base = 0;
+ size_t odx_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+ {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(partitions.size())
+#endif
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto idx = compute_index(index, istride, idx_base);
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+ output[odx + ooffset[0]] = input[idx + ioffset[0]];
+ } while(increment_rowmajor(index, length));
+ }
+ }
+}
+
+// Copy data of dimensions length with strides istride and length idist between batches to
+// a buffer with strides ostride and length odist between batches. The input type is
+// planar and the output type is complex interleaved.
+template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers_2to1(const Tval* input0,
+ const Tval* input1,
+ rocfft_complex<Tval>* output,
+ const Tint1& whole_length,
+ const size_t nbatch,
+ const Tint2& istride,
+ const size_t idist,
+ const Tint3& ostride,
+ const size_t odist,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset)
+{
+ const bool idx_equals_odx = istride == ostride && idist == odist;
+ size_t idx_base = 0;
+ size_t odx_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+ {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(partitions.size())
+#endif
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto idx = compute_index(index, istride, idx_base);
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+ output[odx + ooffset[0]]
+ = rocfft_complex<Tval>(input0[idx + ioffset[0]], input1[idx + ioffset[1]]);
+ } while(increment_rowmajor(index, length));
+ }
+ }
+}
+
+// Copy data of dimensions length with strides istride and length idist between batches to
+// a buffer with strides ostride and length odist between batches. The input type is
+// complex interleaved and the output type is planar.
+template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers_1to2(const rocfft_complex<Tval>* input,
+ Tval* output0,
+ Tval* output1,
+ const Tint1& whole_length,
+ const size_t nbatch,
+ const Tint2& istride,
+ const size_t idist,
+ const Tint3& ostride,
+ const size_t odist,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset)
+{
+ const bool idx_equals_odx = istride == ostride && idist == odist;
+ size_t idx_base = 0;
+ size_t odx_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+ {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(partitions.size())
+#endif
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto idx = compute_index(index, istride, idx_base);
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+ output0[odx + ooffset[0]] = input[idx + ioffset[0]].real();
+ output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag();
+ } while(increment_rowmajor(index, length));
+ }
+ }
+}
+
+// Copy data of dimensions length with strides istride and length idist between batches to
+// a buffer with strides ostride and length odist between batches. The input type is given
+// by itype, and the output type is given by otype.
+template <typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers(const std::vector<hostbuf>& input,
+ std::vector<hostbuf>& output,
+ const Tint1& length,
+ const size_t nbatch,
+ const fft_precision precision,
+ const fft_array_type itype,
+ const Tint2& istride,
+ const size_t idist,
+ const fft_array_type otype,
+ const Tint3& ostride,
+ const size_t odist,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset)
+{
+ if(itype == otype)
+ {
+ switch(itype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ switch(precision)
+ {
+ case fft_precision_half:
+ copy_buffers_1to1(
+ reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+ reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ case fft_precision_single:
+ copy_buffers_1to1(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+ reinterpret_cast<rocfft_complex<float>*>(output[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ case fft_precision_double:
+ copy_buffers_1to1(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+ reinterpret_cast<rocfft_complex<double>*>(output[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ }
+ break;
+ case fft_array_type_real:
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ for(unsigned int idx = 0; idx < input.size(); ++idx)
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ copy_buffers_1to1(reinterpret_cast<const _Float16*>(input[idx].data()),
+ reinterpret_cast<_Float16*>(output[idx].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ case fft_precision_single:
+ copy_buffers_1to1(reinterpret_cast<const float*>(input[idx].data()),
+ reinterpret_cast<float*>(output[idx].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ case fft_precision_double:
+ copy_buffers_1to1(reinterpret_cast<const double*>(input[idx].data()),
+ reinterpret_cast<double*>(output[idx].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ }
+ }
+ break;
+ default:
+ throw std::runtime_error("Invalid data type");
+ }
+ }
+ else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
+ || (itype == fft_array_type_hermitian_interleaved
+ && otype == fft_array_type_hermitian_planar))
+ {
+ // copy 1to2
+ switch(precision)
+ {
+ case fft_precision_half:
+ copy_buffers_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+ reinterpret_cast<_Float16*>(output[0].data()),
+ reinterpret_cast<_Float16*>(output[1].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ case fft_precision_single:
+ copy_buffers_1to2(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+ reinterpret_cast<float*>(output[0].data()),
+ reinterpret_cast<float*>(output[1].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ case fft_precision_double:
+ copy_buffers_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+ reinterpret_cast<double*>(output[0].data()),
+ reinterpret_cast<double*>(output[1].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ }
+ }
+ else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
+ || (itype == fft_array_type_hermitian_planar
+ && otype == fft_array_type_hermitian_interleaved))
+ {
+ // copy 2 to 1
+ switch(precision)
+ {
+ case fft_precision_half:
+ copy_buffers_2to1(reinterpret_cast<const _Float16*>(input[0].data()),
+ reinterpret_cast<const _Float16*>(input[1].data()),
+ reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ case fft_precision_single:
+ copy_buffers_2to1(reinterpret_cast<const float*>(input[0].data()),
+ reinterpret_cast<const float*>(input[1].data()),
+ reinterpret_cast<rocfft_complex<float>*>(output[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ case fft_precision_double:
+ copy_buffers_2to1(reinterpret_cast<const double*>(input[0].data()),
+ reinterpret_cast<const double*>(input[1].data()),
+ reinterpret_cast<rocfft_complex<double>*>(output[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ ioffset,
+ ooffset);
+ break;
+ }
+ }
+ else
+ {
+ throw std::runtime_error("Invalid input and output types.");
+ }
+}
+
+// unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions
+template <typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers(const std::vector<hostbuf>& input,
+ std::vector<hostbuf>& output,
+ const std::vector<Tint1>& length,
+ const size_t nbatch,
+ const fft_precision precision,
+ const fft_array_type itype,
+ const std::vector<Tint2>& istride,
+ const size_t idist,
+ const fft_array_type otype,
+ const std::vector<Tint3>& ostride,
+ const size_t odist,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset)
+{
+ switch(length.size())
+ {
+ case 1:
+ return copy_buffers(input,
+ output,
+ length[0],
+ nbatch,
+ precision,
+ itype,
+ istride[0],
+ idist,
+ otype,
+ ostride[0],
+ odist,
+ ioffset,
+ ooffset);
+ case 2:
+ return copy_buffers(input,
+ output,
+ std::make_tuple(length[0], length[1]),
+ nbatch,
+ precision,
+ itype,
+ std::make_tuple(istride[0], istride[1]),
+ idist,
+ otype,
+ std::make_tuple(ostride[0], ostride[1]),
+ odist,
+ ioffset,
+ ooffset);
+ case 3:
+ return copy_buffers(input,
+ output,
+ std::make_tuple(length[0], length[1], length[2]),
+ nbatch,
+ precision,
+ itype,
+ std::make_tuple(istride[0], istride[1], istride[2]),
+ idist,
+ otype,
+ std::make_tuple(ostride[0], ostride[1], ostride[2]),
+ odist,
+ ioffset,
+ ooffset);
+ default:
+ abort();
+ }
+}
+
+// Compute the L-infinity and L-2 distance between a buffer with strides istride and
+// length idist between batches and a buffer with strides ostride and length odist
+// between batches. Both buffers are of complex type.
+
+struct VectorNorms
+{
+ double l_2 = 0.0, l_inf = 0.0;
+};
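+// (l_inf is the largest elementwise absolute difference encountered and l_2 is
+// the square root of the summed squared differences, as computed by the
+// distance_* helpers below)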
+
+template <typename Tcomplex, typename Tint1, typename Tint2, typename Tint3>
+inline VectorNorms distance_1to1_complex(const Tcomplex* input,
+ const Tcomplex* output,
+ const Tint1& whole_length,
+ const size_t nbatch,
+ const Tint2& istride,
+ const size_t idist,
+ const Tint3& ostride,
+ const size_t odist,
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
+ const double linf_cutoff,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset,
+ const double output_scalar = 1.0)
+{
+ double linf = 0.0;
+ double l2 = 0.0;
+
+ std::mutex linf_failure_lock;
+ std::vector<std::pair<size_t, size_t>> linf_failures_private;
+
+ const bool idx_equals_odx = istride == ostride && idist == odist;
+ size_t idx_base = 0;
+ size_t odx_base = 0;
+ auto partitions = partition_colmajor(whole_length);
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+ {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
+#endif
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ double cur_linf = 0.0;
+ double cur_l2 = 0.0;
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+
+ do
+ {
+ const auto idx = compute_index(index, istride, idx_base);
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+ const double rdiff
+ = std::abs(static_cast<double>(output[odx + ooffset[0]].real()) * output_scalar
+ - static_cast<double>(input[idx + ioffset[0]].real()));
+ cur_linf = std::max(rdiff, cur_linf);
+ if(cur_linf > linf_cutoff)
+ {
+ std::pair<size_t, size_t> fval(b, idx);
+ if(linf_failures)
+ linf_failures_private.push_back(fval);
+ }
+ cur_l2 += rdiff * rdiff;
+
+ const double idiff
+ = std::abs(static_cast<double>(output[odx + ooffset[0]].imag()) * output_scalar
+ - static_cast<double>(input[idx + ioffset[0]].imag()));
+ cur_linf = std::max(idiff, cur_linf);
+ if(cur_linf > linf_cutoff)
+ {
+ std::pair<size_t, size_t> fval(b, idx);
+ if(linf_failures)
+ linf_failures_private.push_back(fval);
+ }
+ cur_l2 += idiff * idiff;
+
+ } while(increment_rowmajor(index, length));
+ linf = std::max(linf, cur_linf);
+ l2 += cur_l2;
+
+ if(linf_failures)
+ {
+ linf_failure_lock.lock();
+ std::copy(linf_failures_private.begin(),
+ linf_failures_private.end(),
+ std::back_inserter(*linf_failures));
+ linf_failure_lock.unlock();
+ }
+ }
+ }
+ return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-infinity and L-2 distance between a buffer with strides istride and
+// length idist between batches and a buffer with strides ostride and length odist
+// between batches. Both buffers are of real type.
+template <typename Tfloat, typename Tint1, typename Tint2, typename Tint3>
+inline VectorNorms distance_1to1_real(const Tfloat* input,
+ const Tfloat* output,
+ const Tint1& whole_length,
+ const size_t nbatch,
+ const Tint2& istride,
+ const size_t idist,
+ const Tint3& ostride,
+ const size_t odist,
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
+ const double linf_cutoff,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset,
+ const double output_scalar = 1.0)
+{
+ double linf = 0.0;
+ double l2 = 0.0;
+
+ std::mutex linf_failure_lock;
+ std::vector<std::pair<size_t, size_t>> linf_failures_private;
+
+ const bool idx_equals_odx = istride == ostride && idist == odist;
+ size_t idx_base = 0;
+ size_t odx_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+ {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
+#endif
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ double cur_linf = 0.0;
+ double cur_l2 = 0.0;
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto idx = compute_index(index, istride, idx_base);
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+ const double diff
+ = std::abs(static_cast<double>(output[odx + ooffset[0]]) * output_scalar
+ - static_cast<double>(input[idx + ioffset[0]]));
+ cur_linf = std::max(diff, cur_linf);
+ if(cur_linf > linf_cutoff)
+ {
+ std::pair<size_t, size_t> fval(b, idx);
+ if(linf_failures)
+ linf_failures_private.push_back(fval);
+ }
+ cur_l2 += diff * diff;
+
+ } while(increment_rowmajor(index, length));
+ linf = std::max(linf, cur_linf);
+ l2 += cur_l2;
+
+ if(linf_failures)
+ {
+ linf_failure_lock.lock();
+ std::copy(linf_failures_private.begin(),
+ linf_failures_private.end(),
+ std::back_inserter(*linf_failures));
+ linf_failure_lock.unlock();
+ }
+ }
+ }
+ return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-infinity and L-2 distance between a buffer with strides istride and
+// distance idist between batches, and a buffer with strides ostride and distance odist
+// between batches. input is complex-interleaved, output is complex-planar.
+template <typename Tval, typename Tint1, typename T2, typename T3>
+inline VectorNorms distance_1to2(const rocfft_complex<Tval>* input,
+ const Tval* output0,
+ const Tval* output1,
+ const Tint1& whole_length,
+ const size_t nbatch,
+ const T2& istride,
+ const size_t idist,
+ const T3& ostride,
+ const size_t odist,
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
+ const double linf_cutoff,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset,
+ const double output_scalar = 1.0)
+{
+ double linf = 0.0;
+ double l2 = 0.0;
+
+ std::mutex linf_failure_lock;
+ std::vector<std::pair<size_t, size_t>> linf_failures_private;
+
+ const bool idx_equals_odx = istride == ostride && idist == odist;
+ size_t idx_base = 0;
+ size_t odx_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+ {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
+#endif
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ double cur_linf = 0.0;
+ double cur_l2 = 0.0;
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto idx = compute_index(index, istride, idx_base);
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+ const double rdiff
+ = std::abs(static_cast<double>(output0[odx + ooffset[0]]) * output_scalar
+ - static_cast<double>(input[idx + ioffset[0]].real()));
+ cur_linf = std::max(rdiff, cur_linf);
+ if(cur_linf > linf_cutoff)
+ {
+ std::pair<size_t, size_t> fval(b, idx);
+ if(linf_failures)
+ linf_failures_private.push_back(fval);
+ }
+ cur_l2 += rdiff * rdiff;
+
+ const double idiff
+ = std::abs(static_cast<double>(output1[odx + ooffset[1]]) * output_scalar
+ - static_cast<double>(input[idx + ioffset[0]].imag()));
+ cur_linf = std::max(idiff, cur_linf);
+ if(cur_linf > linf_cutoff)
+ {
+ std::pair<size_t, size_t> fval(b, idx);
+ if(linf_failures)
+ linf_failures_private.push_back(fval);
+ }
+ cur_l2 += idiff * idiff;
+
+ } while(increment_rowmajor(index, length));
+ linf = std::max(linf, cur_linf);
+ l2 += cur_l2;
+
+ if(linf_failures)
+ {
+ linf_failure_lock.lock();
+ std::copy(linf_failures_private.begin(),
+ linf_failures_private.end(),
+ std::back_inserter(*linf_failures));
+ linf_failure_lock.unlock();
+ }
+ }
+ }
+ return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-infinity and L-2 distance between two buffers of dimension length and
+// with types given by itype, otype, and precision.
+template <typename Tint1, typename Tint2, typename Tint3>
+inline VectorNorms distance(const std::vector<hostbuf>& input,
+ const std::vector<hostbuf>& output,
+ const Tint1& length,
+ const size_t nbatch,
+ const fft_precision precision,
+ const fft_array_type itype,
+ const Tint2& istride,
+ const size_t idist,
+ const fft_array_type otype,
+ const Tint3& ostride,
+ const size_t odist,
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
+ const double linf_cutoff,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset,
+ const double output_scalar = 1.0)
+{
+ VectorNorms dist;
+
+ if(itype == otype)
+ {
+ switch(itype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ switch(precision)
+ {
+ case fft_precision_half:
+ dist = distance_1to1_complex(
+ reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+ reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ case fft_precision_single:
+ dist = distance_1to1_complex(
+ reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+ reinterpret_cast<const rocfft_complex<float>*>(output[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ case fft_precision_double:
+ dist = distance_1to1_complex(
+ reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+ reinterpret_cast<const rocfft_complex<double>*>(output[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ }
+ dist.l_2 *= dist.l_2;
+ break;
+ case fft_array_type_real:
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ for(unsigned int idx = 0; idx < input.size(); ++idx)
+ {
+ VectorNorms d;
+ switch(precision)
+ {
+ case fft_precision_half:
+ d = distance_1to1_real(reinterpret_cast<const _Float16*>(input[idx].data()),
+ reinterpret_cast<const _Float16*>(output[idx].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ case fft_precision_single:
+ d = distance_1to1_real(reinterpret_cast<const float*>(input[idx].data()),
+ reinterpret_cast<const float*>(output[idx].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ case fft_precision_double:
+ d = distance_1to1_real(reinterpret_cast<const double*>(input[idx].data()),
+ reinterpret_cast<const double*>(output[idx].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ }
+ dist.l_inf = std::max(d.l_inf, dist.l_inf);
+ dist.l_2 += d.l_2 * d.l_2;
+ }
+ break;
+ default:
+ throw std::runtime_error("Invalid input and output types.");
+ }
+ }
+ else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
+ || (itype == fft_array_type_hermitian_interleaved
+ && otype == fft_array_type_hermitian_planar))
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+ reinterpret_cast<const _Float16*>(output[0].data()),
+ reinterpret_cast<const _Float16*>(output[1].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ case fft_precision_single:
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+ reinterpret_cast<const float*>(output[0].data()),
+ reinterpret_cast<const float*>(output[1].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ case fft_precision_double:
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+ reinterpret_cast<const double*>(output[0].data()),
+ reinterpret_cast<const double*>(output[1].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ ostride,
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ }
+ dist.l_2 *= dist.l_2;
+ }
+ else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
+ || (itype == fft_array_type_hermitian_planar
+ && otype == fft_array_type_hermitian_interleaved))
+ {
+ switch(precision)
+ {
+ case fft_precision_half:
+ dist
+ = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()),
+ reinterpret_cast<const _Float16*>(input[0].data()),
+ reinterpret_cast<const _Float16*>(input[1].data()),
+ length,
+ nbatch,
+ ostride,
+ odist,
+ istride,
+ idist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ case fft_precision_single:
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<float>*>(output[0].data()),
+ reinterpret_cast<const float*>(input[0].data()),
+ reinterpret_cast<const float*>(input[1].data()),
+ length,
+ nbatch,
+ ostride,
+ odist,
+ istride,
+ idist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ case fft_precision_double:
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(output[0].data()),
+ reinterpret_cast<const double*>(input[0].data()),
+ reinterpret_cast<const double*>(input[1].data()),
+ length,
+ nbatch,
+ ostride,
+ odist,
+ istride,
+ idist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ break;
+ }
+ dist.l_2 *= dist.l_2;
+ }
+ else
+ {
+ throw std::runtime_error("Invalid input and output types.");
+ }
+ dist.l_2 = sqrt(dist.l_2);
+ return dist;
+}
+
+// check if the specified length + stride/dist is contiguous
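+// Illustrative example: length {4, 3} with stride {3, 1} and dist 12 is
+// contiguous row-major (the expected strides are 1 for the innermost and 3 for
+// the outer dimension, and 4 * 3 == 12); padding the batch to dist 13 would
+// make it non-contiguous.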
+template <typename Tint1, typename Tint2>
+bool is_contiguous_rowmajor(const std::vector<Tint1>& length,
+ const std::vector<Tint2>& stride,
+ size_t dist)
+{
+ size_t expected_stride = 1;
+ auto stride_it = stride.rbegin();
+ auto length_it = length.rbegin();
+ for(; stride_it != stride.rend() && length_it != length.rend(); ++stride_it, ++length_it)
+ {
+ if(*stride_it != expected_stride)
+ return false;
+ expected_stride *= *length_it;
+ }
+ return expected_stride == dist;
+}
+
+// Unroll arbitrary-dimension distance into specializations for 1-, 2-, 3-dimensions
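+// Illustrative example: length {8, 4} with contiguous strides {4, 1}, dist 32
+// and nbatch 2 collapses to a single 1D length of 8 * 4 * 2 = 64 with unit
+// stride and nbatch 1 before dispatching to the 1D case.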
+template <typename Tint1, typename Tint2, typename Tint3>
+inline VectorNorms distance(const std::vector<hostbuf>& input,
+ const std::vector<hostbuf>& output,
+ std::vector<Tint1> length,
+ size_t nbatch,
+ const fft_precision precision,
+ const fft_array_type itype,
+ std::vector<Tint2> istride,
+ const size_t idist,
+ const fft_array_type otype,
+ std::vector<Tint3> ostride,
+ const size_t odist,
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
+ const double linf_cutoff,
+ const std::vector<size_t>& ioffset,
+ const std::vector<size_t>& ooffset,
+ const double output_scalar = 1.0)
+{
+ // If istride and ostride are both contiguous, collapse them down
+ // to one dimension. Index calculation is simpler (and faster)
+ // in the 1D case.
+ if(is_contiguous_rowmajor(length, istride, idist)
+ && is_contiguous_rowmajor(length, ostride, odist))
+ {
+ length = {product(length.begin(), length.end()) * nbatch};
+ istride = {static_cast<Tint2>(1)};
+ ostride = {static_cast<Tint3>(1)};
+ nbatch = 1;
+ }
+
+ switch(length.size())
+ {
+ case 1:
+ return distance(input,
+ output,
+ length[0],
+ nbatch,
+ precision,
+ itype,
+ istride[0],
+ idist,
+ otype,
+ ostride[0],
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ case 2:
+ return distance(input,
+ output,
+ std::make_tuple(length[0], length[1]),
+ nbatch,
+ precision,
+ itype,
+ std::make_tuple(istride[0], istride[1]),
+ idist,
+ otype,
+ std::make_tuple(ostride[0], ostride[1]),
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ case 3:
+ return distance(input,
+ output,
+ std::make_tuple(length[0], length[1], length[2]),
+ nbatch,
+ precision,
+ itype,
+ std::make_tuple(istride[0], istride[1], istride[2]),
+ idist,
+ otype,
+ std::make_tuple(ostride[0], ostride[1], ostride[2]),
+ odist,
+ linf_failures,
+ linf_cutoff,
+ ioffset,
+ ooffset,
+ output_scalar);
+ default:
+ abort();
+ }
+}
+
+// Compute the L-infinity and L-2 norm of a buffer with strides istride and
+// length idist. Data is rocfft_complex.
+template <typename Tcomplex, typename T1, typename T2>
+inline VectorNorms norm_complex(const Tcomplex* input,
+ const T1& whole_length,
+ const size_t nbatch,
+ const T2& istride,
+ const size_t idist,
+ const std::vector<size_t>& offset)
+{
+ double linf = 0.0;
+ double l2 = 0.0;
+
+ size_t idx_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist)
+ {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
+#endif
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ double cur_linf = 0.0;
+ double cur_l2 = 0.0;
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto idx = compute_index(index, istride, idx_base);
+
+ const double rval = std::abs(static_cast<double>(input[idx + offset[0]].real()));
+ cur_linf = std::max(rval, cur_linf);
+ cur_l2 += rval * rval;
+
+ const double ival = std::abs(static_cast<double>(input[idx + offset[0]].imag()));
+ cur_linf = std::max(ival, cur_linf);
+ cur_l2 += ival * ival;
+
+ } while(increment_rowmajor(index, length));
+ linf = std::max(linf, cur_linf);
+ l2 += cur_l2;
+ }
+ }
+ return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-infinity and L-2 norm of a buffer with strides istride and
+// length idist. Data is real-valued.
+template <typename Tfloat, typename T1, typename T2>
+inline VectorNorms norm_real(const Tfloat* input,
+ const T1& whole_length,
+ const size_t nbatch,
+ const T2& istride,
+ const size_t idist,
+ const std::vector<size_t>& offset)
+{
+ double linf = 0.0;
+ double l2 = 0.0;
+
+ size_t idx_base = 0;
+ auto partitions = partition_rowmajor(whole_length);
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist)
+ {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
+#endif
+ for(size_t part = 0; part < partitions.size(); ++part)
+ {
+ double cur_linf = 0.0;
+ double cur_l2 = 0.0;
+ auto index = partitions[part].first;
+ const auto length = partitions[part].second;
+ do
+ {
+ const auto idx = compute_index(index, istride, idx_base);
+ const double val = std::abs(static_cast<double>(input[idx + offset[0]]));
+ cur_linf = std::max(val, cur_linf);
+ cur_l2 += val * val;
+
+ } while(increment_rowmajor(index, length));
+ linf = std::max(linf, cur_linf);
+ l2 += cur_l2;
+ }
+ }
+ return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-infinity and L-2 norm of a buffer with strides istride and
+// length idist. Data format is given by precision and itype.
+template <typename T1, typename T2>
+inline VectorNorms norm(const std::vector<hostbuf>& input,
+ const T1& length,
+ const size_t nbatch,
+ const fft_precision precision,
+ const fft_array_type itype,
+ const T2& istride,
+ const size_t idist,
+ const std::vector<size_t>& offset)
+{
+ VectorNorms norm;
+
+ switch(itype)
+ {
+ case fft_array_type_complex_interleaved:
+ case fft_array_type_hermitian_interleaved:
+ switch(precision)
+ {
+ case fft_precision_half:
+ norm = norm_complex(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ offset);
+ break;
+ case fft_precision_single:
+ norm = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ offset);
+ break;
+ case fft_precision_double:
+ norm = norm_complex(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ offset);
+ break;
+ }
+ norm.l_2 *= norm.l_2;
+ break;
+ case fft_array_type_real:
+ case fft_array_type_complex_planar:
+ case fft_array_type_hermitian_planar:
+ for(unsigned int idx = 0; idx < input.size(); ++idx)
+ {
+ VectorNorms n;
+ switch(precision)
+ {
+ case fft_precision_half:
+ n = norm_real(reinterpret_cast<const _Float16*>(input[idx].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ offset);
+ break;
+ case fft_precision_single:
+ n = norm_real(reinterpret_cast<const float*>(input[idx].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ offset);
+ break;
+ case fft_precision_double:
+ n = norm_real(reinterpret_cast<const double*>(input[idx].data()),
+ length,
+ nbatch,
+ istride,
+ idist,
+ offset);
+ break;
+ }
+ norm.l_inf = std::max(n.l_inf, norm.l_inf);
+ norm.l_2 += n.l_2 * n.l_2;
+ }
+ break;
+ default:
+ throw std::runtime_error("Invalid data type");
+ }
+
+ norm.l_2 = sqrt(norm.l_2);
+ return norm;
+}
+
+// Unroll arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions
+template <typename T1, typename T2>
+inline VectorNorms norm(const std::vector<hostbuf>& input,
+ std::vector<T1> length,
+ size_t nbatch,
+ const fft_precision precision,
+ const fft_array_type type,
+ std::vector<T2> stride,
+ const size_t dist,
+ const std::vector<size_t>& offset)
+{
+ // If stride is contiguous, collapse it down to one dimension.
+ // Index calculation is simpler (and faster) in the 1D case.
+ if(is_contiguous_rowmajor(length, stride, dist))
+ {
+ length = {product(length.begin(), length.end()) * nbatch};
+ stride = {static_cast<T2>(1)};
+ nbatch = 1;
+ }
+
+ switch(length.size())
+ {
+ case 1:
+ return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset);
+ case 2:
+ return norm(input,
+ std::make_tuple(length[0], length[1]),
+ nbatch,
+ precision,
+ type,
+ std::make_tuple(stride[0], stride[1]),
+ dist,
+ offset);
+ case 3:
+ return norm(input,
+ std::make_tuple(length[0], length[1], length[2]),
+ nbatch,
+ precision,
+ type,
+ std::make_tuple(stride[0], stride[1], stride[2]),
+ dist,
+ offset);
+ default:
+ abort();
+ }
+}
+
+// Given a data type and precision, and the number of elements required for
+// each buffer, allocate the required host buffer(s).
+static std::vector<hostbuf> allocate_host_buffer(const fft_precision precision,
+ const fft_array_type type,
+ const std::vector<size_t>& size)
+{
+ std::vector<hostbuf> buffers(size.size());
+ for(unsigned int i = 0; i < size.size(); ++i)
+ {
+ buffers[i].alloc(size[i] * var_size<size_t>(precision, type));
+ }
+ return buffers;
+}
+
+// Check if the required buffers fit in the device vram.
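+// For example, with the 1 << 27 (128 MiB) margin below, a problem requiring
+// 1 GiB of buffers is only considered to fit if more than 1 GiB + 128 MiB of
+// VRAM is reported available.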
+inline bool vram_fits_problem(const size_t prob_size, const size_t vram_avail, int deviceId = 0)
+{
+ // We keep a small margin of error for fitting the problem into vram:
+ const size_t extra = 1 << 27;
+
+ return vram_avail > prob_size + extra;
+}
+
+// Computes the twiddle table VRAM footprint for r2c/c2r transforms.
+// This function will return 0 for the other transform types, since
+// the VRAM footprint in rocFFT is negligible for the other cases.
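+// Illustrative example: a single-precision real transform whose last dimension
+// is 4096 (even) adds 4096 * 8 / 4 = 8192 bytes for the real/complex-even
+// twiddle buffer; odd last dimensions and complex transforms contribute 0 here.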
+inline size_t twiddle_table_vram_footprint(const fft_params& params)
+{
+ size_t vram_footprint = 0;
+
+ // Add vram footprint from real/complex even twiddle buffer size.
+ if(params.transform_type == fft_transform_type_real_forward
+ || params.transform_type == fft_transform_type_real_inverse)
+ {
+ const auto realdim = params.length.back();
+ if(realdim % 2 == 0)
+ {
+ const auto complex_size = params.precision == fft_precision_single ? 8 : 16;
+ // even length twiddle size is 1/4 of the real size, but
+ // in complex elements
+ vram_footprint += realdim * complex_size / 4;
+ }
+ }
+
+ return vram_footprint;
+}
+
+#endif
diff --git a/shared/fftw_transform.h b/shared/fftw_transform.h
new file mode 100644
index 0000000..873a373
--- /dev/null
+++ b/shared/fftw_transform.h
@@ -0,0 +1,493 @@
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+#ifndef FFTWTRANSFORM_H
+#define FFTWTRANSFORM_H
+
+#include "hostbuf.h"
+#include "rocfft_complex.h"
+#include "test_params.h"
+#include <fftw3.h>
+#include <vector>
+
+// Function to return maximum error for float and double types.
+//
+// Following Schatzman (1996; Accuracy of the Discrete Fourier
+// Transform and the Fast Fourier Transform), the shape of relative
+// l_2 error vs length should look like
+//
+// epsilon * sqrt(log2(length)).
+//
+// The magic epsilon constants below were chosen so that we get a
+// reasonable upper bound for (all of) our tests.
+//
+// For rocFFT, prime lengths result in the highest error. As such,
+// the epsilons below are perhaps too loose for pow2 lengths; but they
+// are appropriate for prime lengths.
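+//
+// As an illustrative consequence of the model above, a double-precision
+// length-1024 transform is expected to have a relative l_2 error on the order
+// of double_epsilon * sqrt(log2(1024)) = double_epsilon * sqrt(10).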
+template <typename Tfloat>
+inline double type_epsilon();
+template <>
+inline double type_epsilon<_Float16>()
+{
+ return half_epsilon;
+}
+template <>
+inline double type_epsilon<float>()
+{
+ return single_epsilon;
+}
+template <>
+inline double type_epsilon<double>()
+{
+ return double_epsilon;
+}
+
+// C++ traits to translate float->fftwf_complex and
+// double->fftw_complex.
+// The correct FFTW complex type can be accessed via, for example,
+// using complex_t = typename fftw_trait<Tfloat>::fftw_complex_type;
+template <typename Tfloat>
+struct fftw_trait;
+template <>
+struct fftw_trait<_Float16>
+{
+ // fftw does not support half precision, so use single precision and convert
+ using fftw_complex_type = fftwf_complex;
+ using fftw_plan_type = fftwf_plan;
+};
+template <>
+struct fftw_trait<float>
+{
+ using fftw_complex_type = fftwf_complex;
+ using fftw_plan_type = fftwf_plan;
+};
+template <>
+struct fftw_trait<double>
+{
+ using fftw_complex_type = fftw_complex;
+ using fftw_plan_type = fftw_plan;
+};
+
+// Copies the half-precision input buffer to a single-precision
+// buffer. Note that the input buffer is already sized like it's a
+// single-precision buffer (but only half of it is filled), because
+// we allocate a single-precision buffer for FFTW to plan with.
+static hostbuf half_to_single_copy(const hostbuf& in)
+{
+ auto out = in.copy();
+ auto in_begin = reinterpret_cast<const _Float16*>(in.data());
+ std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast<float*>(out.data()));
+ return out;
+}
+
+// converts a wider precision buffer to a narrower precision, in-place
+template <typename TfloatIn, typename TfloatOut>
+void narrow_precision_inplace(hostbuf& in)
+{
+ // ensure we're actually shrinking the data
+ static_assert(sizeof(TfloatIn) > sizeof(TfloatOut));
+
+ auto readPtr = reinterpret_cast<const TfloatIn*>(in.data());
+ auto writePtr = reinterpret_cast<TfloatOut*>(in.data());
+ std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr);
+ in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut)));
+}
+
+static void single_to_half_inplace(hostbuf& in)
+{
+ narrow_precision_inplace<float, _Float16>(in);
+}
+
+// Template wrappers for real-valued FFTW allocators:
+template <typename Tfloat>
+inline Tfloat* fftw_alloc_real_type(size_t n);
+template <>
+inline float* fftw_alloc_real_type<float>(size_t n)
+{
+ return fftwf_alloc_real(n);
+}
+template <>
+inline double* fftw_alloc_real_type<double>(size_t n)
+{
+ return fftw_alloc_real(n);
+}
+
+// Template wrappers for complex-valued FFTW allocators:
+template <typename Tfloat>
+inline typename fftw_trait<Tfloat>::fftw_complex_type* fftw_alloc_complex_type(size_t n);
+template <>
+inline typename fftw_trait<float>::fftw_complex_type* fftw_alloc_complex_type<float>(size_t n)
+{
+ return fftwf_alloc_complex(n);
+}
+template <>
+inline typename fftw_trait<double>::fftw_complex_type* fftw_alloc_complex_type<double>(size_t n)
+{
+ return fftw_alloc_complex(n);
+}
+
+template <typename fftw_type>
+inline fftw_type* fftw_alloc_type(size_t n);
+template <>
+inline float* fftw_alloc_type<float>(size_t n)
+{
+ return fftw_alloc_real_type<float>(n);
+}
+template <>
+inline double* fftw_alloc_type<double>(size_t n)
+{
+ return fftw_alloc_real_type<double>(n);
+}
+template <>
+inline fftwf_complex* fftw_alloc_type<fftwf_complex>(size_t n)
+{
+ return fftw_alloc_complex_type<float>(n);
+}
+template <>
+inline fftw_complex* fftw_alloc_type<fftw_complex>(size_t n)
+{
+ return fftw_alloc_complex_type<double>(n);
+}
+template <>
+inline rocfft_complex<float>* fftw_alloc_type<rocfft_complex<float>>(size_t n)
+{
+ return (rocfft_complex<float>*)fftw_alloc_complex_type<float>(n);
+}
+template <>
+inline rocfft_complex<double>* fftw_alloc_type<rocfft_complex<double>>(size_t n)
+{
+ return (rocfft_complex<double>*)fftw_alloc_complex_type<double>(n);
+}
+
+// Template wrappers for FFTW plan executors:
+template <typename Tfloat>
+inline void fftw_execute_type(typename fftw_trait<Tfloat>::fftw_plan_type plan);
+template <>
+inline void fftw_execute_type<float>(typename fftw_trait<float>::fftw_plan_type plan)
+{
+ return fftwf_execute(plan);
+}
+template <>
+inline void fftw_execute_type<double>(typename fftw_trait<double>::fftw_plan_type plan)
+{
+ return fftw_execute(plan);
+}
+
+// Template wrappers for FFTW plan destroyers:
+template <typename Tfftw_plan>
+inline void fftw_destroy_plan_type(Tfftw_plan plan);
+template <>
+inline void fftw_destroy_plan_type<fftwf_plan>(fftwf_plan plan)
+{
+ return fftwf_destroy_plan(plan);
+}
+template <>
+inline void fftw_destroy_plan_type<fftw_plan>(fftw_plan plan)
+{
+ return fftw_destroy_plan(plan);
+}
+
+// Template wrappers for FFTW c2c planners:
+template <typename Tfloat>
+inline typename fftw_trait<Tfloat>::fftw_plan_type
+ fftw_plan_guru64_dft(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ typename fftw_trait<Tfloat>::fftw_complex_type* in,
+ typename fftw_trait<Tfloat>::fftw_complex_type* out,
+ int sign,
+ unsigned flags);
+
+template <>
+inline typename fftw_trait<_Float16>::fftw_plan_type
+ fftw_plan_guru64_dft<_Float16>(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ typename fftw_trait<_Float16>::fftw_complex_type* in,
+ typename fftw_trait<_Float16>::fftw_complex_type* out,
+ int sign,
+ unsigned flags)
+{
+ return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
+}
+
+template <>
+inline typename fftw_trait<float>::fftw_plan_type
+ fftw_plan_guru64_dft<float>(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ typename fftw_trait<float>::fftw_complex_type* in,
+ typename fftw_trait<float>::fftw_complex_type* out,
+ int sign,
+ unsigned flags)
+{
+ return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
+}
+
+template <>
+inline typename fftw_trait<double>::fftw_plan_type
+ fftw_plan_guru64_dft<double>(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ typename fftw_trait<double>::fftw_complex_type* in,
+ typename fftw_trait<double>::fftw_complex_type* out,
+ int sign,
+ unsigned flags)
+{
+ return fftw_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
+}
+
+// Template wrappers for FFTW c2c executors:
+template <typename Tfloat>
+inline void fftw_plan_execute_c2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out);
+
+template <>
+inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ // since FFTW does not natively support half precision, convert
+ // input to single, execute, then convert output back to half
+ auto in_single = half_to_single_copy(in.front());
+ fftwf_execute_dft(plan,
+ reinterpret_cast<fftwf_complex*>(in_single.data()),
+ reinterpret_cast<fftwf_complex*>(out.front().data()));
+ single_to_half_inplace(out.front());
+}
+
+template <>
+inline void fftw_plan_execute_c2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ fftwf_execute_dft(plan,
+ reinterpret_cast<fftwf_complex*>(in.front().data()),
+ reinterpret_cast<fftwf_complex*>(out.front().data()));
+}
+
+template <>
+inline void fftw_plan_execute_c2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ fftw_execute_dft(plan,
+ reinterpret_cast<fftw_complex*>(in.front().data()),
+ reinterpret_cast<fftw_complex*>(out.front().data()));
+}
+
+// Template wrappers for FFTW r2c planners:
+template <typename Tfloat>
+inline typename fftw_trait<Tfloat>::fftw_plan_type
+ fftw_plan_guru64_r2c(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ Tfloat* in,
+ typename fftw_trait<Tfloat>::fftw_complex_type* out,
+ unsigned flags);
+template <>
+inline typename fftw_trait<_Float16>::fftw_plan_type
+ fftw_plan_guru64_r2c<_Float16>(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ _Float16* in,
+ typename fftw_trait<_Float16>::fftw_complex_type* out,
+ unsigned flags)
+{
+ return fftwf_plan_guru64_dft_r2c(
+ rank, dims, howmany_rank, howmany_dims, reinterpret_cast<float*>(in), out, flags);
+}
+template <>
+inline typename fftw_trait<float>::fftw_plan_type
+ fftw_plan_guru64_r2c<float>(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ float* in,
+ typename fftw_trait<float>::fftw_complex_type* out,
+ unsigned flags)
+{
+ return fftwf_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags);
+}
+template <>
+inline typename fftw_trait<double>::fftw_plan_type
+ fftw_plan_guru64_r2c<double>(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ double* in,
+ typename fftw_trait<double>::fftw_complex_type* out,
+ unsigned flags)
+{
+ return fftw_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags);
+}
+
+// Template wrappers for FFTW r2c executors:
+template <typename Tfloat>
+inline void fftw_plan_execute_r2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out);
+template <>
+inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait<float>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ // since FFTW does not natively support half precision, convert
+ // input to single, execute, then convert output back to half
+ auto in_single = half_to_single_copy(in.front());
+ fftwf_execute_dft_r2c(plan,
+ reinterpret_cast<float*>(in_single.data()),
+ reinterpret_cast<fftwf_complex*>(out.front().data()));
+ single_to_half_inplace(out.front());
+}
+template <>
+inline void fftw_plan_execute_r2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ fftwf_execute_dft_r2c(plan,
+ reinterpret_cast<float*>(in.front().data()),
+ reinterpret_cast<fftwf_complex*>(out.front().data()));
+}
+template <>
+inline void fftw_plan_execute_r2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ fftw_execute_dft_r2c(plan,
+ reinterpret_cast<double*>(in.front().data()),
+ reinterpret_cast<fftw_complex*>(out.front().data()));
+}
+
+// Template wrappers for FFTW c2r planners:
+template <typename Tfloat>
+inline typename fftw_trait<Tfloat>::fftw_plan_type
+ fftw_plan_guru64_c2r(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ typename fftw_trait<Tfloat>::fftw_complex_type* in,
+ Tfloat* out,
+ unsigned flags);
+template <>
+inline typename fftw_trait<_Float16>::fftw_plan_type
+ fftw_plan_guru64_c2r<_Float16>(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ typename fftw_trait<_Float16>::fftw_complex_type* in,
+ _Float16* out,
+ unsigned flags)
+{
+ return fftwf_plan_guru64_dft_c2r(
+ rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast<float*>(out), flags);
+}
+template <>
+inline typename fftw_trait<float>::fftw_plan_type
+ fftw_plan_guru64_c2r<float>(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ typename fftw_trait<float>::fftw_complex_type* in,
+ float* out,
+ unsigned flags)
+{
+ return fftwf_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags);
+}
+template <>
+inline typename fftw_trait<double>::fftw_plan_type
+ fftw_plan_guru64_c2r<double>(int rank,
+ const fftw_iodim64* dims,
+ int howmany_rank,
+ const fftw_iodim64* howmany_dims,
+ typename fftw_trait<double>::fftw_complex_type* in,
+ double* out,
+ unsigned flags)
+{
+ return fftw_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags);
+}
+
+// Template wrappers for FFTW c2r executors:
+template <typename Tfloat>
+inline void fftw_plan_execute_c2r(typename fftw_trait<Tfloat>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out);
+template <>
+inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait<float>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ // since FFTW does not natively support half precision, convert
+ // input to single, execute, then convert output back to half
+ auto in_single = half_to_single_copy(in.front());
+ fftwf_execute_dft_c2r(plan,
+ reinterpret_cast<fftwf_complex*>(in_single.data()),
+ reinterpret_cast<float*>(out.front().data()));
+ single_to_half_inplace(out.front());
+}
+template <>
+inline void fftw_plan_execute_c2r<float>(typename fftw_trait<float>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ fftwf_execute_dft_c2r(plan,
+ reinterpret_cast<fftwf_complex*>(in.front().data()),
+ reinterpret_cast<float*>(out.front().data()));
+}
+template <>
+inline void fftw_plan_execute_c2r<double>(typename fftw_trait<double>::fftw_plan_type plan,
+ std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ fftw_execute_dft_c2r(plan,
+ reinterpret_cast<fftw_complex*>(in.front().data()),
+ reinterpret_cast<double*>(out.front().data()));
+}
+
+#ifdef FFTW_HAVE_SPRINT_PLAN
+// Template wrappers for FFTW print plan:
+template <typename Tfloat>
+inline char* fftw_sprint_plan(const typename fftw_trait<Tfloat>::fftw_plan_type plan);
+template <>
+inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan)
+{
+ return fftwf_sprint_plan(plan);
+}
+template <>
+inline char* fftw_sprint_plan<float>(const typename fftw_trait<float>::fftw_plan_type plan)
+{
+ return fftwf_sprint_plan(plan);
+}
+template <>
+inline char* fftw_sprint_plan<double>(const typename fftw_trait<double>::fftw_plan_type plan)
+{
+ return fftw_sprint_plan(plan);
+}
+#endif
+
+#endif
diff --git a/shared/gpubuf.h b/shared/gpubuf.h
new file mode 100644
index 0000000..993fa95
--- /dev/null
+++ b/shared/gpubuf.h
@@ -0,0 +1,134 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_GPUBUF_H
+#define ROCFFT_GPUBUF_H
+
+#include "rocfft_hip.h"
+#include <cstdlib>
+
+// Simple RAII class for GPU buffers. T is the type of pointer that
+// data() returns
+template <class T = void>
+class gpubuf_t
+{
+public:
+ gpubuf_t() {}
+ // buffers are movable but not copyable
+ gpubuf_t(gpubuf_t&& other)
+ {
+ std::swap(buf, other.buf);
+ std::swap(bsize, other.bsize);
+ std::swap(device, other.device);
+ }
+ gpubuf_t& operator=(gpubuf_t&& other)
+ {
+ std::swap(buf, other.buf);
+ std::swap(bsize, other.bsize);
+ std::swap(device, other.device);
+ return *this;
+ }
+ gpubuf_t(const gpubuf_t&) = delete;
+ gpubuf_t& operator=(const gpubuf_t&) = delete;
+
+ ~gpubuf_t()
+ {
+ free();
+ }
+
+ static bool use_alloc_managed()
+ {
+ return std::getenv("ROCFFT_MALLOC_MANAGED");
+ }
+
+ hipError_t alloc(const size_t size)
+ {
+ // remember the device that was current as of alloc, so we can
+ // free on the correct device
+ auto ret = hipGetDevice(&device);
+ if(ret != hipSuccess)
+ return ret;
+
+ bsize = size;
+ static bool alloc_managed = use_alloc_managed();
+ free();
+ ret = alloc_managed ? hipMallocManaged(&buf, bsize) : hipMalloc(&buf, bsize);
+ if(ret != hipSuccess)
+ {
+ buf = nullptr;
+ bsize = 0;
+ }
+ return ret;
+ }
+
+ size_t size() const
+ {
+ return bsize;
+ }
+
+ void free()
+ {
+ if(buf != nullptr)
+ {
+ // free on the device we allocated on
+ rocfft_scoped_device dev(device);
+ (void)hipFree(buf);
+ buf = nullptr;
+ bsize = 0;
+ }
+ }
+
+ // return a pointer to the allocated memory, offset by the
+ // specified number of bytes
+ T* data_offset(size_t offset_bytes = 0) const
+ {
+ void* ptr = static_cast<char*>(buf) + offset_bytes;
+ return static_cast<T*>(ptr);
+ }
+
+ T* data() const
+ {
+ return static_cast<T*>(buf);
+ }
+
+ // equality/bool tests
+ bool operator==(std::nullptr_t n) const
+ {
+ return buf == n;
+ }
+ bool operator!=(std::nullptr_t n) const
+ {
+ return buf != n;
+ }
+ operator bool() const
+ {
+ return buf;
+ }
+
+private:
+ // The GPU buffer
+ void* buf = nullptr;
+ size_t bsize = 0;
+ int device = 0;
+};
+
+// default gpubuf that gives out void* pointers
+typedef gpubuf_t<> gpubuf;
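+
+// Usage sketch (illustrative; names are hypothetical):
+//   gpubuf_t<float> buf;
+//   if(buf.alloc(n * sizeof(float)) == hipSuccess)
+//       do_something(buf.data()); // memory is freed when buf goes out of scope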
+#endif
diff --git a/shared/hip_object_wrapper.h b/shared/hip_object_wrapper.h
new file mode 100644
index 0000000..54083ab
--- /dev/null
+++ b/shared/hip_object_wrapper.h
@@ -0,0 +1,86 @@
+/******************************************************************************
+* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef ROCFFT_HIP_OBJ_WRAPPER_H
+#define ROCFFT_HIP_OBJ_WRAPPER_H
+
+#include "rocfft_hip.h"
+
+// RAII wrapper around HIP objects
+template <typename T, auto TCreate, auto TDestroy>
+struct hip_object_wrapper_t
+{
+ hip_object_wrapper_t()
+ : obj(nullptr)
+ {
+ }
+
+ void alloc()
+ {
+ if(obj == nullptr && TCreate(&obj) != hipSuccess)
+ throw std::runtime_error("hip create failure");
+ }
+
+ void free()
+ {
+ if(obj)
+ {
+ (void)TDestroy(obj);
+ obj = nullptr;
+ }
+ }
+
+ operator const T&() const
+ {
+ return obj;
+ }
+ operator T&()
+ {
+ return obj;
+ }
+
+ operator bool() const
+ {
+ return obj != nullptr;
+ }
+
+ ~hip_object_wrapper_t()
+ {
+ free();
+ }
+
+ hip_object_wrapper_t(const hip_object_wrapper_t&) = delete;
+ hip_object_wrapper_t& operator=(const hip_object_wrapper_t&) = delete;
+ hip_object_wrapper_t(hip_object_wrapper_t&& other)
+ : obj(other.obj)
+ {
+ other.obj = nullptr;
+ }
+
+private:
+ T obj;
+};
+
+typedef hip_object_wrapper_t<hipStream_t, hipStreamCreate, hipStreamDestroy> hipStream_wrapper_t;
+typedef hip_object_wrapper_t<hipEvent_t, hipEventCreate, hipEventDestroy> hipEvent_wrapper_t;
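+
+// Usage sketch (illustrative): hipStream_wrapper_t stream; stream.alloc();
+// then pass it wherever a hipStream_t is expected; the stream is destroyed
+// automatically when the wrapper goes out of scope.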
+
+#endif // ROCFFT_HIP_OBJ_WRAPPER_H
diff --git a/shared/hostbuf.h b/shared/hostbuf.h
new file mode 100644
index 0000000..0a96c7d
--- /dev/null
+++ b/shared/hostbuf.h
@@ -0,0 +1,158 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_HOSTBUF_H
+#define ROCFFT_HOSTBUF_H
+
+#include "arithmetic.h"
+#include <cstdlib>
+#include <cstring>
+
+#ifndef WIN32
+#include <stdlib.h>
+#include <sys/mman.h>
+#endif
+
+// Simple RAII class for host buffers. T is the type of pointer that
+// data() returns
+template <class T = void>
+class hostbuf_t
+{
+public:
+ hostbuf_t() {}
+ // buffers are movable but not copyable
+ hostbuf_t(hostbuf_t&& other)
+ {
+ std::swap(buf, other.buf);
+ std::swap(bsize, other.bsize);
+ }
+ hostbuf_t& operator=(hostbuf_t&& other)
+ {
+ std::swap(buf, other.buf);
+ std::swap(bsize, other.bsize);
+ return *this;
+ }
+ hostbuf_t(const hostbuf_t&) = delete;
+ hostbuf_t& operator=(const hostbuf_t&) = delete;
+
+ ~hostbuf_t()
+ {
+ free();
+ }
+
+ void alloc(size_t size)
+ {
+ bsize = size;
+ free();
+
+ // we're aligning to multiples of 64 bytes, so round the
+ // allocation size up to the nearest 64 to keep ASAN happy
+ if(size % 64)
+ {
+ size += 64 - size % 64;
+ }
+
+ // FFTW requires aligned allocations to use faster SIMD instructions.
+ // If enabling hugepages, align to 2 MiB. Otherwise, aligning to
+ // 64 bytes is enough for AVX instructions up to AVX512.
+#ifdef WIN32
+ buf = _aligned_malloc(size, 64);
+#else
+ // On Linux, ask for hugepages to reduce TLB pressure and
+ // improve performance. Allocations need to be aligned to
+ // the hugepage size, and rounded up to the next whole
+ // hugepage.
+ static const size_t TWO_MiB = 2 * 1024 * 1024;
+ if(size >= TWO_MiB)
+ {
+ size_t rounded_size = DivRoundingUp(size, TWO_MiB) * TWO_MiB;
+ buf = aligned_alloc(TWO_MiB, rounded_size);
+ madvise(buf, rounded_size, MADV_HUGEPAGE);
+ }
+ else
+ buf = aligned_alloc(64, size);
+#endif
+ }
+
+ size_t size() const
+ {
+ return bsize;
+ }
+
+ void free()
+ {
+ if(buf != nullptr)
+ {
+#ifdef WIN32
+ _aligned_free(buf);
+#else
+ std::free(buf);
+#endif
+ buf = nullptr;
+ bsize = 0;
+ }
+ }
+
+ T* data() const
+ {
+ return static_cast<T*>(buf);
+ }
+
+ // Copy method
+ hostbuf_t copy() const
+ {
+ hostbuf_t copy;
+ copy.alloc(bsize);
+ memcpy(copy.buf, buf, bsize);
+ return copy;
+ }
+
+ // shrink the buffer to fit the new size
+ void shrink(size_t new_size)
+ {
+ if(new_size > bsize)
+ throw std::runtime_error("can't shrink hostbuf to larger size");
+ // just pretend the buffer is now that size
+ bsize = new_size;
+ }
+
+ // equality/bool tests
+ bool operator==(std::nullptr_t n) const
+ {
+ return buf == n;
+ }
+ bool operator!=(std::nullptr_t n) const
+ {
+ return buf != n;
+ }
+ operator bool() const
+ {
+ return buf;
+ }
+
+private:
+ // The host buffer
+ void* buf = nullptr;
+ size_t bsize = 0;
+};
+
+// default hostbuf that gives out void* pointers
+typedef hostbuf_t<> hostbuf;
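+
+// Usage sketch (illustrative; names are hypothetical): hostbuf b;
+// b.alloc(n * sizeof(double)); auto* p = static_cast<double*>(b.data());
+// the allocation is freed when b goes out of scope.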
+#endif
diff --git a/shared/increment.h b/shared/increment.h
new file mode 100644
index 0000000..90bba1d
--- /dev/null
+++ b/shared/increment.h
@@ -0,0 +1,100 @@
+// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_INCREMENT_H
+#define ROCFFT_INCREMENT_H
+
+#include <algorithm>
+#include <tuple>
+#include <vector>
+
+// Helper functions to iterate over a buffer in row-major order.
+// Indexes may be given as either a tuple or vector of sizes. They
+// return true if the index was successfully incremented to move to
+// the next element in the buffer.
+
+template <typename T1, typename T2>
+static bool increment_base(T1& index, const T2& length)
+{
+ static_assert(std::is_integral<T1>::value, "Integral required.");
+ static_assert(std::is_integral<T2>::value, "Integral required.");
+ if(index < length - 1)
+ {
+ ++index;
+ return true;
+ }
+ index = 0;
+ return false;
+}
+
+// Increment the index (row-major) for looping over 1, 2, and 3 dimensions length.
+template <typename T1, typename T2>
+static bool increment_rowmajor(T1& index, const T2& length)
+{
+ static_assert(std::is_integral<T1>::value, "Integral required.");
+ static_assert(std::is_integral<T2>::value, "Integral required.");
+ return increment_base(index, length);
+}
+
+template <typename T1, typename T2>
+static bool increment_rowmajor(std::tuple<T1, T1>& index, const std::tuple<T2, T2>& length)
+{
+ if(increment_base(std::get<1>(index), std::get<1>(length)))
+ // we incremented ok, nothing further to do
+ return true;
+ // otherwise, we rolled over
+ return increment_base(std::get<0>(index), std::get<0>(length));
+}
+
+template <typename T1, typename T2>
+static bool increment_rowmajor(std::tuple<T1, T1, T1>& index, const std::tuple<T2, T2, T2>& length)
+{
+ if(increment_base(std::get<2>(index), std::get<2>(length)))
+ // we incremented ok, nothing further to do
+ return true;
+ if(increment_base(std::get<1>(index), std::get<1>(length)))
+ // we incremented ok, nothing further to do
+ return true;
+ // otherwise, we rolled over
+ return increment_base(std::get<0>(index), std::get<0>(length));
+}
+
+// Increment row-major index over arbitrary dimension length
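+// Illustrative example: with length {2, 3}, successive calls starting from
+// index {0, 0} visit {0, 1}, {0, 2}, {1, 0}, {1, 1}, {1, 2}, then return false
+// with the index wrapped back to {0, 0}.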
+template <typename T1, typename T2>
+bool increment_rowmajor(std::vector<T1>& index, const std::vector<T2>& length)
+{
+ for(int idim = length.size(); idim-- > 0;)
+ {
+ if(index[idim] < length[idim])
+ {
+ if((++index[idim]) == length[idim])
+ {
+ index[idim] = 0;
+ continue;
+ }
+ // we know we were able to increment something and didn't hit the end
+ return true;
+ }
+ }
+ // End the loop when we get back to the start:
+ return !std::all_of(index.begin(), index.end(), [](int i) { return i == 0; });
+}
+
+#endif
diff --git a/shared/precision_type.h b/shared/precision_type.h
new file mode 100644
index 0000000..526fc9a
--- /dev/null
+++ b/shared/precision_type.h
@@ -0,0 +1,70 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_PRECISION_TYPE_H
+#define ROCFFT_PRECISION_TYPE_H
+
+#include "array_predicate.h"
+#include "rocfft/rocfft.h"
+
+static size_t real_type_size(rocfft_precision precision)
+{
+ switch(precision)
+ {
+ case rocfft_precision_half:
+ return 2;
+ case rocfft_precision_single:
+ return 4;
+ case rocfft_precision_double:
+ return 8;
+ }
+}
+
+static size_t complex_type_size(rocfft_precision precision)
+{
+ return real_type_size(precision) * 2;
+}
+
+static const char* precision_name(rocfft_precision precision)
+{
+ switch(precision)
+ {
+ case rocfft_precision_half:
+ return "half";
+ case rocfft_precision_single:
+ return "single";
+ case rocfft_precision_double:
+ return "double";
+ }
+}
+
+static size_t element_size(rocfft_precision precision, rocfft_array_type array_type)
+{
+ return array_type_is_complex(array_type) ? complex_type_size(precision)
+ : real_type_size(precision);
+}
+
+// offset a pointer by a number of elements, given the elements'
+// precision and type (complex or not)
+static void* ptr_offset(void* p, size_t elems, rocfft_precision precision, rocfft_array_type type)
+{
+ return static_cast<char*>(p) + elems * element_size(precision, type);
+}
+#endif
diff --git a/shared/printbuffer.h b/shared/printbuffer.h
new file mode 100644
index 0000000..5ae0b64
--- /dev/null
+++ b/shared/printbuffer.h
@@ -0,0 +1,108 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef PRINTBUFFER_H
+#define PRINTBUFFER_H
+
+#include "hostbuf.h"
+#include "increment.h"
+#include <algorithm>
+#include <vector>
+
+// Output a formatted general-dimensional array with given length and stride in batches
+// separated by dist.
+template <typename Toutput, typename T1, typename T2, typename Tsize, typename Tstream>
+inline void printbuffer(const Toutput* output,
+ const std::vector<T1>& length,
+ const std::vector<T2>& stride,
+ const Tsize nbatch,
+ const Tsize dist,
+ const size_t offset,
+ Tstream& stream)
+{
+ auto i_base = 0;
+ for(unsigned int b = 0; b < nbatch; b++, i_base += dist)
+ {
+ std::vector<size_t> index(length.size());
+ std::fill(index.begin(), index.end(), 0);
+ do
+ {
+ const int i
+ = std::inner_product(index.begin(), index.end(), stride.begin(), i_base + offset);
+ stream << output[i] << " ";
+ for(int li = index.size(); li-- > 0;)
+ {
+ if(index[li] == (length[li] - 1))
+ {
+ stream << "\n";
+ }
+ else
+ {
+ break;
+ }
+ }
+ } while(increment_rowmajor(index, length));
+ stream << std::endl;
+ }
+}
+
+template <typename Telem>
+class buffer_printer
+{
+ // The scalar versions might be part of a planar format.
+public:
+ template <typename Tint1, typename Tint2, typename Tsize, typename Tstream = std::ostream>
+ static void print_buffer(const std::vector<hostbuf>& buf,
+ const std::vector<Tint1>& length,
+ const std::vector<Tint2>& stride,
+ const Tsize nbatch,
+ const Tsize dist,
+ const std::vector<size_t>& offset,
+ Tstream& stream = std::cout)
+ {
+ for(const auto& vec : buf)
+ {
+ printbuffer(reinterpret_cast<const Telem*>(vec.data()),
+ length,
+ stride,
+ nbatch,
+ dist,
+ offset[0],
+ stream);
+ }
+ };
+ template <typename Tstream = std::ostream>
+ static void print_buffer_flat(const std::vector<hostbuf>& buf,
+ const std::vector<size_t>& size,
+ const std::vector<size_t>& offset,
+ Tstream& stream = std::cout)
+ {
+ for(const auto& vec : buf)
+ {
+ auto data = reinterpret_cast<const Telem*>(vec.data());
+ stream << "idx " << 0;
+ for(size_t i = 0; i < size[0]; ++i)
+ stream << " " << data[i];
+ stream << std::endl;
+ }
+ };
+};
+
+#endif
diff --git a/shared/ptrdiff.h b/shared/ptrdiff.h
new file mode 100644
index 0000000..3bd15de
--- /dev/null
+++ b/shared/ptrdiff.h
@@ -0,0 +1,40 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+// Compute the farthest point from the original pointer.
+static size_t compute_ptrdiff(const std::vector<size_t>& length,
+ const std::vector<size_t>& stride,
+ const size_t nbatch,
+ const size_t dist)
+{
+ size_t val = 0;
+ if(!length.empty())
+ {
+ val = 1;
+ for(unsigned int i = 0; i < length.size(); ++i)
+ {
+ val += (length[i] - 1) * stride[i];
+ }
+ val += (nbatch - 1) * dist;
+ }
+ return val;
+}
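The value returned is one past the farthest index reachable from the base pointer, so for a fully contiguous layout it reduces to the total element count. A small sketch checking that arithmetic; the lengths, strides and batch values are illustrative only:

#include <cstddef>
#include <vector>

#include "ptrdiff.h"

int main()
{
    // length {3,4}, contiguous row-major strides {4,1}, 2 batches, dist 12:
    // 1 + (3-1)*4 + (4-1)*1 + (2-1)*12 = 24, i.e. the full 3*4*2 element count.
    const size_t n = compute_ptrdiff({3, 4}, {4, 1}, 2, 12);
    return n == 24 ? 0 : 1;
}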
diff --git a/shared/rocfft_accuracy_test.h b/shared/rocfft_accuracy_test.h
new file mode 100644
index 0000000..4ce3059
--- /dev/null
+++ b/shared/rocfft_accuracy_test.h
@@ -0,0 +1,29 @@
+// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_ACCURACY_TEST
+#define ROCFFT_ACCURACY_TEST
+
+#include "accuracy_test.h"
+#include "rocfft_params.h"
+
+void fft_vs_reference(rocfft_params& params, bool round_trip = false);
+
+#endif
diff --git a/shared/rocfft_against_fftw.h b/shared/rocfft_against_fftw.h
new file mode 100644
index 0000000..d03754c
--- /dev/null
+++ b/shared/rocfft_against_fftw.h
@@ -0,0 +1,231 @@
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+#ifndef ROCFFT_AGAINST_FFTW
+#define ROCFFT_AGAINST_FFTW
+
+#include <gtest/gtest.h>
+#include <math.h>
+#include <stdexcept>
+#include <vector>
+
+#include "fftw_transform.h"
+
+// Return the precision enum for rocFFT based upon the type.
+template <typename Tfloat>
+inline fft_precision precision_selector();
+template <>
+inline fft_precision precision_selector<float>()
+{
+ return fft_precision_single;
+}
+template <>
+inline fft_precision precision_selector<double>()
+{
+ return fft_precision_double;
+}
+
+extern bool use_fftw_wisdom;
+
+// construct and return an FFTW plan with the specified type,
+// precision, and dimensions. cpu_out is required if we're using
+// wisdom, which runs actual FFTs to work out the best plan.
+template <typename Tfloat>
+static typename fftw_trait<Tfloat>::fftw_plan_type
+ fftw_plan_with_precision(const std::vector<fftw_iodim64>& dims,
+ const std::vector<fftw_iodim64>& howmany_dims,
+ const fft_transform_type transformType,
+ const size_t isize,
+ void* cpu_in,
+ void* cpu_out)
+{
+ using fftw_complex_type = typename fftw_trait<Tfloat>::fftw_complex_type;
+
+ // NB: Using FFTW_MEASURE implies that the input buffer's data
+    // may be destroyed during plan creation. But if we are running FFTW
+    // at all, the input buffer was only just created and is still
+    // uninitialized, so nothing is lost.
+
+ switch(transformType)
+ {
+ case fft_transform_type_complex_forward:
+ return fftw_plan_guru64_dft<Tfloat>(dims.size(),
+ dims.data(),
+ howmany_dims.size(),
+ howmany_dims.data(),
+ reinterpret_cast<fftw_complex_type*>(cpu_in),
+ reinterpret_cast<fftw_complex_type*>(cpu_out),
+ -1,
+ use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
+ case fft_transform_type_complex_inverse:
+ return fftw_plan_guru64_dft<Tfloat>(dims.size(),
+ dims.data(),
+ howmany_dims.size(),
+ howmany_dims.data(),
+ reinterpret_cast<fftw_complex_type*>(cpu_in),
+ reinterpret_cast<fftw_complex_type*>(cpu_out),
+ 1,
+ use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
+ case fft_transform_type_real_forward:
+ return fftw_plan_guru64_r2c<Tfloat>(dims.size(),
+ dims.data(),
+ howmany_dims.size(),
+ howmany_dims.data(),
+ reinterpret_cast<Tfloat*>(cpu_in),
+ reinterpret_cast<fftw_complex_type*>(cpu_out),
+ use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
+ case fft_transform_type_real_inverse:
+ return fftw_plan_guru64_c2r<Tfloat>(dims.size(),
+ dims.data(),
+ howmany_dims.size(),
+ howmany_dims.data(),
+ reinterpret_cast<fftw_complex_type*>(cpu_in),
+ reinterpret_cast<Tfloat*>(cpu_out),
+ use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
+ default:
+ throw std::runtime_error("Invalid transform type");
+ }
+}
+
+// construct an FFTW plan, given rocFFT parameters. output is
+// required if planning with wisdom.
+template <typename Tfloat>
+static typename fftw_trait<Tfloat>::fftw_plan_type
+ fftw_plan_via_rocfft(const std::vector<size_t>& length,
+ const std::vector<size_t>& istride,
+ const std::vector<size_t>& ostride,
+ const size_t nbatch,
+ const size_t idist,
+ const size_t odist,
+ const fft_transform_type transformType,
+ std::vector<hostbuf>& input,
+ std::vector<hostbuf>& output)
+{
+ // Dimension configuration:
+ std::vector<fftw_iodim64> dims(length.size());
+ for(unsigned int idx = 0; idx < length.size(); ++idx)
+ {
+ dims[idx].n = length[idx];
+ dims[idx].is = istride[idx];
+ dims[idx].os = ostride[idx];
+ }
+
+ // Batch configuration:
+ std::vector<fftw_iodim64> howmany_dims(1);
+ howmany_dims[0].n = nbatch;
+ howmany_dims[0].is = idist;
+ howmany_dims[0].os = odist;
+
+ return fftw_plan_with_precision<Tfloat>(dims,
+ howmany_dims,
+ transformType,
+ idist * nbatch,
+ input.front().data(),
+ output.empty() ? nullptr : output.front().data());
+}
+
+template <typename Tfloat>
+void fftw_run(fft_transform_type transformType,
+ typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan,
+ std::vector<hostbuf>& cpu_in,
+ std::vector<hostbuf>& cpu_out)
+{
+ switch(transformType)
+ {
+ case fft_transform_type_complex_forward:
+ {
+ fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
+ break;
+ }
+ case fft_transform_type_complex_inverse:
+ {
+ fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
+ break;
+ }
+ case fft_transform_type_real_forward:
+ {
+ fftw_plan_execute_r2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
+ break;
+ }
+ case fft_transform_type_real_inverse:
+ {
+ fftw_plan_execute_c2r<Tfloat>(cpu_plan, cpu_in, cpu_out);
+ break;
+ }
+ }
+}
+
+// Given a transform type, return the contiguous input type.
+inline fft_array_type contiguous_itype(const fft_transform_type transformType)
+{
+ switch(transformType)
+ {
+ case fft_transform_type_complex_forward:
+ case fft_transform_type_complex_inverse:
+ return fft_array_type_complex_interleaved;
+ case fft_transform_type_real_forward:
+ return fft_array_type_real;
+ case fft_transform_type_real_inverse:
+ return fft_array_type_hermitian_interleaved;
+ default:
+ throw std::runtime_error("Invalid transform type");
+ }
+ return fft_array_type_complex_interleaved;
+}
+
+// Given a transform type, return the contiguous output type.
+inline fft_array_type contiguous_otype(const fft_transform_type transformType)
+{
+ switch(transformType)
+ {
+ case fft_transform_type_complex_forward:
+ case fft_transform_type_complex_inverse:
+ return fft_array_type_complex_interleaved;
+ case fft_transform_type_real_forward:
+ return fft_array_type_hermitian_interleaved;
+ case fft_transform_type_real_inverse:
+ return fft_array_type_real;
+ default:
+ throw std::runtime_error("Invalid transform type");
+ }
+ return fft_array_type_complex_interleaved;
+}
+
+// Given a precision, return the acceptable tolerance.
+inline double type_epsilon(const fft_precision precision)
+{
+ switch(precision)
+ {
+ case fft_precision_half:
+ return type_epsilon<_Float16>();
+ break;
+ case fft_precision_single:
+ return type_epsilon<float>();
+ break;
+ case fft_precision_double:
+ return type_epsilon<double>();
+ break;
+ default:
+ throw std::runtime_error("Invalid precision");
+ }
+}
+
+#endif
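The contiguous_itype/contiguous_otype helpers encode the usual FFT layout convention: complex transforms stay complex-interleaved on both sides, a forward real transform reads real data and writes a Hermitian (interleaved) half-spectrum, and the inverse reverses that. A minimal sketch of that mapping; it assumes the FFTW and GoogleTest headers pulled in by this header are available at compile time:

#include "rocfft_against_fftw.h"

int main()
{
    const bool ok
        = contiguous_itype(fft_transform_type_real_forward) == fft_array_type_real
          && contiguous_otype(fft_transform_type_real_forward)
                 == fft_array_type_hermitian_interleaved
          && contiguous_itype(fft_transform_type_complex_inverse)
                 == fft_array_type_complex_interleaved;
    return ok ? 0 : 1;
}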
diff --git a/shared/rocfft_complex.h b/shared/rocfft_complex.h
new file mode 100644
index 0000000..efa0290
--- /dev/null
+++ b/shared/rocfft_complex.h
@@ -0,0 +1,346 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_COMPLEX_H
+#define ROCFFT_COMPLEX_H
+
+#include <hip/hip_fp16.h>
+#if !defined(__HIPCC_RTC__)
+#include <iostream>
+#endif
+#include <math.h>
+#include <type_traits>
+
+#ifdef __HIP_PLATFORM_NVIDIA__
+typedef __half _Float16;
+#endif
+
+template <typename Treal>
+struct rocfft_complex
+{
+
+ Treal x; // Real part
+ Treal y; // Imaginary part
+
+ // Constructors
+ // Do not initialize the members x or y by default, to ensure that it can
+ // be used in __shared__ and that it is a trivial class compatible with C.
+ __device__ __host__ rocfft_complex() = default;
+ __device__ __host__ rocfft_complex(const rocfft_complex&) = default;
+ __device__ __host__ rocfft_complex(rocfft_complex&&) = default;
+ __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default;
+ __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default;
+ __device__ __host__ ~rocfft_complex() = default;
+
+ // Constructor from real and imaginary parts
+ __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag)
+ : x{real}
+ , y{imag}
+ {
+ }
+
+ // Conversion from different precision
+ template <typename U>
+ __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex<U>& z)
+ : x(z.x)
+ , y(z.y)
+ {
+ }
+
+ // Accessors
+ __device__ __host__ constexpr Treal real() const
+ {
+ return x;
+ }
+
+ __device__ __host__ constexpr Treal imag() const
+ {
+ return y;
+ }
+
+ // Unary operations
+ __forceinline__ __device__ __host__ rocfft_complex operator-() const
+ {
+ return {-x, -y};
+ }
+
+ __forceinline__ __device__ __host__ rocfft_complex operator+() const
+ {
+ return *this;
+ }
+
+ __device__ __host__ Treal asum(const rocfft_complex& z)
+ {
+ return abs(z.x) + abs(z.y);
+ }
+
+ // Internal real functions
+ static __forceinline__ __device__ __host__ Treal abs(Treal x)
+ {
+ return x < 0 ? -x : x;
+ }
+
+ static __forceinline__ __device__ __host__ float sqrt(float x)
+ {
+ return ::sqrtf(x);
+ }
+
+ static __forceinline__ __device__ __host__ double sqrt(double x)
+ {
+ return ::sqrt(x);
+ }
+
+ // Addition operators
+ __device__ __host__ auto& operator+=(const rocfft_complex& rhs)
+ {
+ return *this = {x + rhs.x, y + rhs.y};
+ }
+
+ __device__ __host__ auto operator+(const rocfft_complex& rhs) const
+ {
+ auto lhs = *this;
+ return lhs += rhs;
+ }
+
+ // Subtraction operators
+ __device__ __host__ auto& operator-=(const rocfft_complex& rhs)
+ {
+ return *this = {x - rhs.x, y - rhs.y};
+ }
+
+ __device__ __host__ auto operator-(const rocfft_complex& rhs) const
+ {
+ auto lhs = *this;
+ return lhs -= rhs;
+ }
+
+ // Multiplication operators
+ __device__ __host__ auto& operator*=(const rocfft_complex& rhs)
+ {
+ return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y};
+ }
+
+ __device__ __host__ auto operator*(const rocfft_complex& rhs) const
+ {
+ auto lhs = *this;
+ return lhs *= rhs;
+ }
+
+ // Division operators
+ __device__ __host__ auto& operator/=(const rocfft_complex& rhs)
+ {
+ // Form of Robert L. Smith's Algorithm 116
+ if(abs(rhs.x) > abs(rhs.y))
+ {
+ Treal ratio = rhs.y / rhs.x;
+ Treal scale = 1 / (rhs.x + rhs.y * ratio);
+ *this = {(x + y * ratio) * scale, (y - x * ratio) * scale};
+ }
+ else
+ {
+ Treal ratio = rhs.x / rhs.y;
+ Treal scale = 1 / (rhs.x * ratio + rhs.y);
+ *this = {(y + x * ratio) * scale, (y * ratio - x) * scale};
+ }
+ return *this;
+ }
+
+ __device__ __host__ auto operator/(const rocfft_complex& rhs) const
+ {
+ auto lhs = *this;
+ return lhs /= rhs;
+ }
+
+ // Comparison operators
+ __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const
+ {
+ return x == rhs.x && y == rhs.y;
+ }
+
+ __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const
+ {
+ return !(*this == rhs);
+ }
+
+ // Operators for complex-real computations
+ template <typename U>
+ __device__ __host__ auto& operator+=(const U& rhs)
+ {
+ return (x += Treal(rhs)), *this;
+ }
+
+ template <typename U>
+ __device__ __host__ auto& operator-=(const U& rhs)
+ {
+ return (x -= Treal(rhs)), *this;
+ }
+
+ __device__ __host__ auto operator+(const Treal& rhs)
+ {
+ auto lhs = *this;
+ return lhs += rhs;
+ }
+
+ __device__ __host__ auto operator-(const Treal& rhs)
+ {
+ auto lhs = *this;
+ return lhs -= rhs;
+ }
+
+ template <typename U>
+ __device__ __host__ auto& operator*=(const U& rhs)
+ {
+ return (x *= Treal(rhs)), (y *= Treal(rhs)), *this;
+ }
+
+ template <typename U>
+ __device__ __host__ auto operator*(const U& rhs) const
+ {
+ auto lhs = *this;
+ return lhs *= Treal(rhs);
+ }
+
+ template <typename U>
+ __device__ __host__ auto& operator/=(const U& rhs)
+ {
+ return (x /= Treal(rhs)), (y /= Treal(rhs)), *this;
+ }
+
+ template <typename U>
+ __device__ __host__ auto operator/(const U& rhs) const
+ {
+ auto lhs = *this;
+ return lhs /= Treal(rhs);
+ }
+
+ template <typename U>
+ __device__ __host__ constexpr bool operator==(const U& rhs) const
+ {
+ return x == Treal(rhs) && y == 0;
+ }
+
+ template <typename U>
+ __device__ __host__ constexpr bool operator!=(const U& rhs) const
+ {
+ return !(*this == rhs);
+ }
+};
+
+// Stream operators
+#if !defined(__HIPCC_RTC__)
+static std::ostream& operator<<(std::ostream& stream, const _Float16& f)
+{
+ return stream << static_cast<double>(f);
+}
+
+template <typename Treal>
+std::ostream& operator<<(std::ostream& out, const rocfft_complex<Treal>& z)
+{
+ return out << '(' << static_cast<double>(z.x) << ',' << static_cast<double>(z.y) << ')';
+}
+#endif
+
+// Operators for real-complex computations
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator+(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+ return {Treal(lhs) + rhs.x, rhs.y};
+}
+
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator-(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+ return {Treal(lhs) - rhs.x, -rhs.y};
+}
+
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator*(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+ return {Treal(lhs) * rhs.x, Treal(lhs) * rhs.y};
+}
+
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator/(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+ // Form of Robert L. Smith's Algorithm 116
+ if(rocfft_complex<Treal>::abs(rhs.x) > rocfft_complex<Treal>::abs(rhs.y))
+ {
+ Treal ratio = rhs.y / rhs.x;
+ Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio);
+ return {scale, -scale * ratio};
+ }
+ else
+ {
+ Treal ratio = rhs.x / rhs.y;
+ Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y);
+ return {ratio * scale, -scale};
+ }
+}
+
+template <typename U, typename Treal>
+__device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+ return Treal(lhs) == rhs.x && 0 == rhs.y;
+}
+
+template <typename U, typename Treal>
+__device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+ return !(lhs == rhs);
+}
+
+// Extending std namespace to handle rocfft_complex datatype
+namespace std
+{
+ template <typename Treal>
+ __device__ __host__ constexpr Treal real(const rocfft_complex<Treal>& z)
+ {
+ return z.x;
+ }
+
+ template <typename Treal>
+ __device__ __host__ constexpr Treal imag(const rocfft_complex<Treal>& z)
+ {
+ return z.y;
+ }
+
+ template <typename Treal>
+ __device__ __host__ constexpr rocfft_complex<Treal> conj(const rocfft_complex<Treal>& z)
+ {
+ return {z.x, -z.y};
+ }
+
+ template <typename Treal>
+ __device__ __host__ inline Treal norm(const rocfft_complex<Treal>& z)
+ {
+ return (z.x * z.x) + (z.y * z.y);
+ }
+
+ template <typename Treal>
+ __device__ __host__ inline Treal abs(const rocfft_complex<Treal>& z)
+ {
+ Treal tr = rocfft_complex<Treal>::abs(z.x), ti = rocfft_complex<Treal>::abs(z.y);
+ return tr > ti ? (ti /= tr, tr * rocfft_complex<Treal>::sqrt(ti * ti + 1))
+ : ti ? (tr /= ti, ti * rocfft_complex<Treal>::sqrt(tr * tr + 1))
+ : 0;
+ }
+}
+
+#endif // ROCFFT_COMPLEX_H
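rocfft_complex is a trivially-copyable stand-in for std::complex that is usable in both host and device code; division uses Smith's algorithm to avoid overflow in the intermediate products. A small host-side sketch with illustrative values, compiled with hipcc so the __device__/__host__ qualifiers are understood:

#include <iostream>

#include "rocfft_complex.h"

int main()
{
    rocfft_complex<double> a{1.0, 2.0};
    rocfft_complex<double> b{3.0, -4.0};

    auto prod = a * b;    // (1 + 2i)(3 - 4i) = 11 + 2i
    auto quot = prod / b; // Smith's algorithm; recovers a = 1 + 2i
    std::cout << prod << " " << quot << "\n";

    return std::abs(quot - a) < 1e-12 ? 0 : 1;
}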
diff --git a/shared/rocfft_hip.h b/shared/rocfft_hip.h
new file mode 100644
index 0000000..e086cab
--- /dev/null
+++ b/shared/rocfft_hip.h
@@ -0,0 +1,52 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef __ROCFFT_HIP_H__
+#define __ROCFFT_HIP_H__
+
+#include <hip/hip_runtime_api.h>
+#include <stdexcept>
+
+class rocfft_scoped_device
+{
+public:
+ rocfft_scoped_device(int device)
+ {
+ if(hipGetDevice(&orig_device) != hipSuccess)
+ throw std::runtime_error("hipGetDevice failure");
+
+ if(hipSetDevice(device) != hipSuccess)
+ throw std::runtime_error("hipSetDevice failure");
+ }
+ ~rocfft_scoped_device()
+ {
+ (void)hipSetDevice(orig_device);
+ }
+
+ // not copyable or movable
+ rocfft_scoped_device(const rocfft_scoped_device&) = delete;
+ rocfft_scoped_device(rocfft_scoped_device&&) = delete;
+ rocfft_scoped_device& operator=(const rocfft_scoped_device&) = delete;
+
+private:
+ int orig_device;
+};
+
+#endif // __ROCFFT_HIP_H__
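rocfft_scoped_device is a small RAII guard: it records the caller's current HIP device, switches to the requested one, and restores the original device when the guard goes out of scope. A minimal sketch of using it to allocate memory on another device without disturbing the caller's selection; alloc_on_device is a hypothetical helper, not part of the patch:

#include <cstddef>
#include <stdexcept>

#include <hip/hip_runtime_api.h>

#include "rocfft_hip.h"

// Allocate bytes on the given device, leaving the caller's current device unchanged.
void* alloc_on_device(int device, size_t bytes)
{
    rocfft_scoped_device guard(device); // switches now, restores at scope exit
    void* ptr = nullptr;
    if(hipMalloc(&ptr, bytes) != hipSuccess)
        throw std::runtime_error("hipMalloc failure");
    return ptr;
}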
diff --git a/shared/rocfft_params.h b/shared/rocfft_params.h
new file mode 100644
index 0000000..bf9b728
--- /dev/null
+++ b/shared/rocfft_params.h
@@ -0,0 +1,585 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_PARAMS_H
+#define ROCFFT_PARAMS_H
+
+#include "../shared/fft_params.h"
+#include "../shared/gpubuf.h"
+#include "rocfft/rocfft.h"
+
+// Return the string of the rocfft_status code
+static std::string rocfft_status_to_string(const rocfft_status ret)
+{
+ switch(ret)
+ {
+ case rocfft_status_success:
+ return "rocfft_status_success";
+ case rocfft_status_failure:
+ return "rocfft_status_failure";
+ case rocfft_status_invalid_arg_value:
+ return "rocfft_status_invalid_arg_value";
+ case rocfft_status_invalid_dimensions:
+ return "rocfft_status_invalid_dimensions";
+ case rocfft_status_invalid_array_type:
+ return "rocfft_status_invalid_array_type";
+ case rocfft_status_invalid_strides:
+ return "rocfft_status_invalid_strides";
+ case rocfft_status_invalid_distance:
+ return "rocfft_status_invalid_distance";
+ case rocfft_status_invalid_offset:
+ return "rocfft_status_invalid_offset";
+ case rocfft_status_invalid_work_buffer:
+ return "rocfft_status_invalid_work_buffer";
+ default:
+ throw std::runtime_error("unknown rocfft_status");
+ }
+}
+
+inline fft_status fft_status_from_rocfftparams(const rocfft_status val)
+{
+ switch(val)
+ {
+ case rocfft_status_success:
+ return fft_status_success;
+ case rocfft_status_failure:
+ return fft_status_failure;
+ case rocfft_status_invalid_arg_value:
+ return fft_status_invalid_arg_value;
+ case rocfft_status_invalid_dimensions:
+ return fft_status_invalid_dimensions;
+ case rocfft_status_invalid_array_type:
+ return fft_status_invalid_array_type;
+ case rocfft_status_invalid_strides:
+ return fft_status_invalid_strides;
+ case rocfft_status_invalid_distance:
+ return fft_status_invalid_distance;
+ case rocfft_status_invalid_offset:
+ return fft_status_invalid_offset;
+ case rocfft_status_invalid_work_buffer:
+ return fft_status_invalid_work_buffer;
+ default:
+ throw std::runtime_error("Invalid status");
+ }
+}
+
+inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val)
+{
+ switch(val)
+ {
+ case fft_precision_single:
+ return rocfft_precision_single;
+ case fft_precision_double:
+ return rocfft_precision_double;
+ case fft_precision_half:
+ return rocfft_precision_half;
+ default:
+ throw std::runtime_error("Invalid precision");
+ }
+}
+
+inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val)
+{
+ switch(val)
+ {
+ case fft_array_type_complex_interleaved:
+ return rocfft_array_type_complex_interleaved;
+ case fft_array_type_complex_planar:
+ return rocfft_array_type_complex_planar;
+ case fft_array_type_real:
+ return rocfft_array_type_real;
+ case fft_array_type_hermitian_interleaved:
+ return rocfft_array_type_hermitian_interleaved;
+ case fft_array_type_hermitian_planar:
+ return rocfft_array_type_hermitian_planar;
+ case fft_array_type_unset:
+ return rocfft_array_type_unset;
+ }
+ return rocfft_array_type_unset;
+}
+
+inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val)
+{
+ switch(val)
+ {
+ case fft_transform_type_complex_forward:
+ return rocfft_transform_type_complex_forward;
+ case fft_transform_type_complex_inverse:
+ return rocfft_transform_type_complex_inverse;
+ case fft_transform_type_real_forward:
+ return rocfft_transform_type_real_forward;
+ case fft_transform_type_real_inverse:
+ return rocfft_transform_type_real_inverse;
+ default:
+ throw std::runtime_error("Invalid transform type");
+ }
+}
+
+inline rocfft_result_placement
+ rocfft_result_placement_from_fftparams(const fft_result_placement val)
+{
+ switch(val)
+ {
+ case fft_placement_inplace:
+ return rocfft_placement_inplace;
+ case fft_placement_notinplace:
+ return rocfft_placement_notinplace;
+ default:
+ throw std::runtime_error("Invalid result placement");
+ }
+}
+
+class rocfft_params : public fft_params
+{
+public:
+ rocfft_plan plan = nullptr;
+ rocfft_execution_info info = nullptr;
+ rocfft_plan_description desc = nullptr;
+ gpubuf_t<void> wbuffer;
+
+ explicit rocfft_params(){};
+
+ explicit rocfft_params(const fft_params& p)
+ : fft_params(p){};
+
+ rocfft_params(const rocfft_params&) = delete;
+ rocfft_params& operator=(const rocfft_params&) = delete;
+
+ ~rocfft_params()
+ {
+ free();
+ };
+
+ void free()
+ {
+ if(plan != nullptr)
+ {
+ rocfft_plan_destroy(plan);
+ plan = nullptr;
+ }
+ if(info != nullptr)
+ {
+ rocfft_execution_info_destroy(info);
+ info = nullptr;
+ }
+ if(desc != nullptr)
+ {
+ rocfft_plan_description_destroy(desc);
+ desc = nullptr;
+ }
+ wbuffer.free();
+ }
+
+ void validate_fields() const override
+ {
+ // row-major lengths including batch (i.e. batch is at the front)
+ std::vector<size_t> length_with_batch{nbatch};
+ std::copy(length.begin(), length.end(), std::back_inserter(length_with_batch));
+
+ auto validate_field = [&](const fft_field& f) {
+ for(const auto& b : f.bricks)
+ {
+ // bricks must have same dim as FFT, including batch
+ if(b.lower.size() != length.size() + 1 || b.upper.size() != length.size() + 1
+ || b.stride.size() != length.size() + 1)
+ throw std::runtime_error(
+ "brick dimension does not match FFT + batch dimension");
+
+ // ensure lower < upper, and that both fit in the FFT + batch dims
+ if(!std::lexicographical_compare(
+ b.lower.begin(), b.lower.end(), b.upper.begin(), b.upper.end()))
+ throw std::runtime_error("brick lower index is not less than upper index");
+
+ if(!std::lexicographical_compare(b.lower.begin(),
+ b.lower.end(),
+ length_with_batch.begin(),
+ length_with_batch.end()))
+ throw std::runtime_error(
+ "brick lower index is not less than FFT + batch length");
+
+ if(!std::lexicographical_compare(b.upper.begin(),
+ b.upper.end(),
+ length_with_batch.begin(),
+ length_with_batch.end())
+ && b.upper != length_with_batch)
+ throw std::runtime_error("brick upper index is not <= FFT + batch length");
+ }
+ };
+
+ for(const auto& ifield : ifields)
+ validate_field(ifield);
+ for(const auto& ofield : ofields)
+ validate_field(ofield);
+ }
+
+ rocfft_precision get_rocfft_precision()
+ {
+ return rocfft_precision_from_fftparams(precision);
+ }
+
+ size_t vram_footprint() override
+ {
+ size_t val = fft_params::vram_footprint();
+ if(setup_structs() != fft_status_success)
+ {
+ throw std::runtime_error("Struct setup failed");
+ }
+ val += workbuffersize;
+
+ return val;
+ }
+
+ // Convert the generic fft_field structure to a rocfft_field
+ // structure that can be passed to rocFFT. In particular, we need
+ // to convert from row-major to column-major.
+ static rocfft_field fft_field_to_rocfft_field(const fft_field& f)
+ {
+ rocfft_field rfield = nullptr;
+ if(f.bricks.empty())
+ return rfield;
+
+ if(rocfft_field_create(&rfield) != rocfft_status_success)
+ throw std::runtime_error("rocfft_field_create failed");
+ for(const auto& b : f.bricks)
+ {
+ // rocFFT wants column-major bricks and fft_params stores
+ // row-major
+ std::vector<size_t> lower_cm;
+ std::copy(b.lower.rbegin(), b.lower.rend(), std::back_inserter(lower_cm));
+ std::vector<size_t> upper_cm;
+ std::copy(b.upper.rbegin(), b.upper.rend(), std::back_inserter(upper_cm));
+ std::vector<size_t> stride_cm;
+ std::copy(b.stride.rbegin(), b.stride.rend(), std::back_inserter(stride_cm));
+
+ rocfft_brick rbrick = nullptr;
+ if(rocfft_brick_create(&rbrick,
+ lower_cm.data(), // field_lower
+ upper_cm.data(), // field_upper
+ stride_cm.data(), // brick_stride
+ lower_cm.size(), // dim
+ b.device) // deviceID
+ != rocfft_status_success)
+ throw std::runtime_error("rocfft_brick_create failed");
+
+ if(rocfft_field_add_brick(rfield, rbrick) != rocfft_status_success)
+ throw std::runtime_error("rocfft_field_add_brick failed");
+
+ rocfft_brick_destroy(rbrick);
+ }
+ return rfield;
+ }
+
+ fft_status setup_structs()
+ {
+ rocfft_status fft_status = rocfft_status_success;
+ if(desc == nullptr)
+ {
+            fft_status = rocfft_plan_description_create(&desc);
+ if(fft_status != rocfft_status_success)
+ return fft_status_from_rocfftparams(fft_status);
+
+ fft_status
+ = rocfft_plan_description_set_data_layout(desc,
+ rocfft_array_type_from_fftparams(itype),
+ rocfft_array_type_from_fftparams(otype),
+ ioffset.data(),
+ ooffset.data(),
+ istride_cm().size(),
+ istride_cm().data(),
+ idist,
+ ostride_cm().size(),
+ ostride_cm().data(),
+ odist);
+ if(fft_status != rocfft_status_success)
+ {
+ throw std::runtime_error("rocfft_plan_description_set_data_layout failed");
+ }
+
+ if(scale_factor != 1.0)
+ {
+ fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor);
+ if(fft_status != rocfft_status_success)
+ {
+ throw std::runtime_error("rocfft_plan_description_set_scale_factor failed");
+ }
+ }
+
+ for(const auto& ifield : ifields)
+ {
+ rocfft_field infield = fft_field_to_rocfft_field(ifield);
+ if(rocfft_plan_description_add_infield(desc, infield) != rocfft_status_success)
+ throw std::runtime_error("rocfft_description_add_infield failed");
+ rocfft_field_destroy(infield);
+ }
+
+ for(const auto& ofield : ofields)
+ {
+ rocfft_field outfield = fft_field_to_rocfft_field(ofield);
+ if(rocfft_plan_description_add_outfield(desc, outfield) != rocfft_status_success)
+ throw std::runtime_error("rocfft_description_add_outfield failed");
+ rocfft_field_destroy(outfield);
+ }
+ }
+
+ if(plan == nullptr)
+ {
+ fft_status = rocfft_plan_create(&plan,
+ rocfft_result_placement_from_fftparams(placement),
+ rocfft_transform_type_from_fftparams(transform_type),
+ get_rocfft_precision(),
+ length_cm().size(),
+ length_cm().data(),
+ nbatch,
+ desc);
+ if(fft_status != rocfft_status_success)
+ {
+ throw std::runtime_error("rocfft_plan_create failed");
+ }
+ }
+
+ if(info == nullptr)
+ {
+ fft_status = rocfft_execution_info_create(&info);
+ if(fft_status != rocfft_status_success)
+ {
+ throw std::runtime_error("rocfft_execution_info_create failed");
+ }
+ }
+
+ fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize);
+ if(fft_status != rocfft_status_success)
+ {
+ throw std::runtime_error("rocfft_plan_get_work_buffer_size failed");
+ }
+
+ return fft_status_from_rocfftparams(fft_status);
+ }
+
+ fft_status create_plan() override
+ {
+ fft_status ret = setup_structs();
+ if(ret != fft_status_success)
+ {
+ return ret;
+ }
+ if(workbuffersize > 0)
+ {
+ hipError_t hip_status = hipSuccess;
+ hip_status = wbuffer.alloc(workbuffersize);
+ if(hip_status != hipSuccess)
+ {
+ std::ostringstream oss;
+ oss << "work buffer allocation failed (" << workbuffersize << " requested)";
+ size_t mem_free = 0;
+ size_t mem_total = 0;
+ hip_status = hipMemGetInfo(&mem_free, &mem_total);
+ if(hip_status == hipSuccess)
+ {
+ oss << "free vram: " << mem_free << " total vram: " << mem_total;
+ }
+ else
+ {
+ oss << "hipMemGetInfo also failed";
+ }
+ throw work_buffer_alloc_failure(oss.str());
+ }
+
+ auto rocret
+ = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize);
+ if(rocret != rocfft_status_success)
+ {
+ throw std::runtime_error("rocfft_execution_info_set_work_buffer failed");
+ }
+ }
+
+ return ret;
+ }
+
+ fft_status set_callbacks(void* load_cb_host,
+ void* load_cb_data,
+ void* store_cb_host,
+ void* store_cb_data) override
+ {
+ if(run_callbacks)
+ {
+ auto roc_status
+ = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0);
+ if(roc_status != rocfft_status_success)
+ return fft_status_from_rocfftparams(roc_status);
+
+ roc_status
+ = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0);
+ if(roc_status != rocfft_status_success)
+ return fft_status_from_rocfftparams(roc_status);
+ }
+ return fft_status_success;
+ }
+
+ fft_status execute(void** in, void** out) override
+ {
+ auto ret = rocfft_execute(plan, in, out, info);
+ return fft_status_from_rocfftparams(ret);
+ }
+
+ // scatter data to multiple GPUs and adjust I/O buffers to match
+ void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
+ std::vector<void*>& pibuffer,
+ std::vector<void*>& pobuffer) override
+ {
+ auto alloc_fields = [&](const fft_params::fft_field& field,
+ fft_array_type array_type,
+ std::vector<void*>& pbuffer,
+ bool copy_input) {
+ if(field.bricks.empty())
+ return;
+
+ // we have a field defined, clear the list of buffers as
+ // we'll be allocating new ones for each brick
+ pbuffer.clear();
+
+ for(const auto& b : field.bricks)
+ {
+ // get brick's length - note that this includes batch
+ // dimension
+ const auto brick_len = b.length();
+ const auto brick_stride = b.stride;
+
+ const size_t brick_size_elems = product(brick_len.begin(), brick_len.end());
+ const size_t elem_size_bytes = var_size<size_t>(precision, array_type);
+ const size_t brick_size_bytes = brick_size_elems * elem_size_bytes;
+
+ // set device for the alloc, but we want to return to the
+ // default device as the source of a following memcpy
+ {
+ rocfft_scoped_device dev(b.device);
+ multi_gpu_data.emplace_back();
+ if(multi_gpu_data.back().alloc(brick_size_bytes) != hipSuccess)
+ throw std::runtime_error("device allocation failure");
+ pbuffer.push_back(multi_gpu_data.back().data());
+ }
+
+ if(copy_input)
+ {
+ // For now, assume we're only splitting on highest FFT
+ // dimension, lower-dimensional FFT data is all
+ // contiguous, and batches are contiguous in each brick.
+ //
+ // That means we can express this as a 2D memcpy.
+ const size_t unbatched_elems_per_brick
+ = product(brick_len.begin() + 1, brick_len.end());
+ const size_t unbatched_elems_per_fft = product(length.begin(), length.end());
+
+ // get this brick's starting offset in the field
+ const size_t brick_offset
+ = b.lower_field_offset(istride, idist) * elem_size_bytes;
+
+ // copy from original input - note that we're
+ // assuming interleaved data so ibuffer has only one
+ // gpubuf
+ if(hipMemcpy2D(pbuffer.back(),
+ unbatched_elems_per_brick * elem_size_bytes,
+ ibuffer.front().data_offset(brick_offset),
+ unbatched_elems_per_fft * elem_size_bytes,
+ unbatched_elems_per_brick * elem_size_bytes,
+ brick_len.front(),
+ hipMemcpyHostToDevice)
+ != hipSuccess)
+ throw std::runtime_error("hipMemcpy failure");
+ }
+ }
+
+ // if we copied the input to all the other devices, and
+ // this is an out-of-place transform, we no longer
+ // need the original input
+ if(copy_input && placement == fft_placement_notinplace)
+ ibuffer.clear();
+ };
+
+ // assume one input, one output field for simple cases
+ if(!ifields.empty())
+ alloc_fields(ifields.front(), itype, pibuffer, true);
+ if(!ofields.empty())
+ {
+ if(!ifields.empty() && placement == fft_placement_inplace)
+ pobuffer = pibuffer;
+ else
+ alloc_fields(ofields.front(), otype, pobuffer, false);
+ }
+ }
+
+ // when preparing for multi-GPU transform, we need to allocate data
+ // on each GPU. This vector remembers all of those allocations.
+ std::vector<gpubuf> multi_gpu_data;
+
+ // gather data after multi-GPU FFT for verification
+ void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) override
+ {
+ if(ofields.empty())
+ return;
+
+ for(size_t i = 0; i < ofields.front().bricks.size(); ++i)
+ {
+ const auto& b = ofields.front().bricks[i];
+ const auto& brick_ptr = pobuffer[i];
+
+ const auto brick_len = b.length();
+
+ const size_t elem_size_bytes = var_size<size_t>(precision, otype);
+
+ // get this brick's starting offset in the field
+ const size_t brick_offset = b.lower_field_offset(ostride, odist) * elem_size_bytes;
+
+ // switch device to where we're copying from
+ rocfft_scoped_device dev(b.device);
+
+ // For now, assume we're only splitting on highest FFT
+ // dimension, lower-dimensional FFT data is all
+ // contiguous, and batches are contiguous in each brick.
+ //
+ // That means we can express this as a 2D memcpy.
+ const size_t unbatched_elems_per_brick
+ = product(brick_len.begin() + 1, brick_len.end());
+ const auto output_length = olength();
+ const size_t unbatched_elems_per_fft
+ = product(output_length.begin(), output_length.end());
+
+ // copy to original output buffer - note that
+ // we're assuming interleaved data so obuffer
+ // has only one gpubuf
+ if(hipMemcpy2D(obuffer.front().data_offset(brick_offset),
+ unbatched_elems_per_fft * elem_size_bytes,
+ brick_ptr,
+ unbatched_elems_per_brick * elem_size_bytes,
+ unbatched_elems_per_brick * elem_size_bytes,
+ brick_len.front(),
+ hipMemcpyDeviceToDevice)
+ != hipSuccess)
+ throw std::runtime_error("hipMemcpy failure");
+
+ // device-to-device transfers don't synchronize with the
+ // host, add explicit sync
+ (void)hipDeviceSynchronize();
+ }
+ pobuffer.clear();
+ pobuffer.push_back(obuffer.front().data());
+ }
+};
+
+#endif
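Putting the wrapper together: a typical caller fills in the generic fft_params fields, lets the base class derive strides and distances, then creates and executes the rocFFT plan through this class. A minimal sketch of that flow; run_forward_fft is a hypothetical helper, in and out are assumed to point at valid device buffers of the right size, and fft_params::validate() is assumed to fill in the derived stride/dist fields as it does in the rocFFT test harness:

#include "rocfft_params.h"

fft_status run_forward_fft(void* in, void* out)
{
    rocfft_params params;
    params.length         = {256};
    params.precision      = fft_precision_single;
    params.transform_type = fft_transform_type_complex_forward;
    params.placement      = fft_placement_notinplace;
    params.validate(); // assumed to derive strides, distances and array types

    auto status = params.create_plan();
    if(status != fft_status_success)
        return status;

    void* pin[]  = {in};
    void* pout[] = {out};
    return params.execute(pin, pout);
}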
diff --git a/shared/test_params.h b/shared/test_params.h
new file mode 100644
index 0000000..8d8f6f7
--- /dev/null
+++ b/shared/test_params.h
@@ -0,0 +1,51 @@
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+#ifndef TESTCONSTANTS_H
+#define TESTCONSTANTS_H
+
+#include <stdexcept>
+
+extern int verbose;
+extern size_t ramgb;
+extern size_t vramgb;
+
+extern size_t n_random_tests;
+
+extern size_t random_seed;
+extern double planar_prob;
+extern double callback_prob;
+
+extern double half_epsilon;
+extern double single_epsilon;
+extern double double_epsilon;
+extern bool skip_runtime_fails;
+
+extern double max_linf_eps_double;
+extern double max_l2_eps_double;
+extern double max_linf_eps_single;
+extern double max_l2_eps_single;
+extern double max_linf_eps_half;
+extern double max_l2_eps_half;
+
+extern int n_hip_failures;
+
+#endif
diff --git a/shared/work_queue.h b/shared/work_queue.h
new file mode 100644
index 0000000..e13fc41
--- /dev/null
+++ b/shared/work_queue.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+template <typename _WorkItem>
+struct WorkQueue
+{
+ void push(_WorkItem&& i)
+ {
+ std::unique_lock<std::mutex> lock(queueMutex);
+ items.emplace(std::move(i));
+ emptyWait.notify_all();
+ }
+ _WorkItem pop()
+ {
+ std::unique_lock<std::mutex> lock(queueMutex);
+ while(items.empty())
+ emptyWait.wait(lock);
+ _WorkItem item(items.front());
+ items.pop();
+ return item;
+ }
+
+private:
+ std::queue<_WorkItem> items;
+ std::mutex queueMutex;
+ std::condition_variable emptyWait;
+};
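WorkQueue is a minimal thread-safe producer/consumer queue: push() appends under the mutex and wakes any waiter, while pop() blocks on the condition variable until an item is available. A small sketch of the intended usage pattern; the item type and the "quit" sentinel are illustrative only:

#include <iostream>
#include <string>
#include <thread>

#include "work_queue.h"

int main()
{
    WorkQueue<std::string> queue;

    // producer: push a few items, then a sentinel telling the consumer to stop
    std::thread producer([&queue] {
        queue.push("item 1");
        queue.push("item 2");
        queue.push("quit");
    });

    // consumer: pop() blocks until an item arrives
    for(;;)
    {
        const std::string item = queue.pop();
        if(item == "quit")
            break;
        std::cout << "got " << item << "\n";
    }

    producer.join();
    return 0;
}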