
* Bump the version for ROCm 6.0.0
* Add patch files
* Fix style check errors
* Patch to remove hipblas client file installation in 6.0
* Patch needs to be applied on all 5.7 releases
* 6.0 update for math libs and other packages, new GitHub URL, etc.
* Correct package-audit failures
* Correct the shasum for the rocfft patch and limit the patch in rocblas
* Revert updates in rocprofiler-dev due to a ci-gitlab failure
* Fix ci-gitlab failures caused by disabling HIP backward compatibility
* Add a patch file to change HIP_PLATFORM from HCC to AMD and from NVCC to NVIDIA
* Use gcnArchName in place of gcnArch, since gcnArch is deprecated as of ROCm 6.0.0 (see the sketch after this list)
* Patches to fix magma and blaspp build errors with ROCm 6.0.0
* Patches for mfem and arborx for ROCm 6.0
* Update dependent versions
* Update petsc to build with ROCm 6.0; reverting-operator-mixup-fix-for-slate.patch is needed for ROCm 6.0
* Revert the URL change for 2.7.4-rocm-enhanced
* hiptensor 6.0.0 update
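For reference, the most mechanical of the changes above is the gcnArch-to-gcnArchName migration: ROCm 6.0.0 drops the integer `hipDeviceProp_t::gcnArch` field, so code must read the `gcnArchName` string instead. A minimal sketch of the new-style query (illustrative only; not part of the patch below):

```cpp
// Sketch of the ROCm 6.0 way to query the GPU architecture.
// hipDeviceProp_t::gcnArch (an int, e.g. 908) is gone in 6.0;
// gcnArchName holds a string such as "gfx908:sramecc+:xnack-".
#include <hip/hip_runtime.h>

#include <cstdio>
#include <string>

int main()
{
    hipDeviceProp_t prop;
    if(hipGetDeviceProperties(&prop, 0) != hipSuccess)
        return 1;

    const std::string arch(prop.gcnArchName);
    // Code that matched on the integer gcnArch now matches on the
    // "gfxNNN" prefix of gcnArchName instead.
    std::printf("device 0 architecture: %s\n", arch.substr(0, arch.find(':')).c_str());
    return 0;
}
```

Packages that previously compared `gcnArch` against numeric values need an equivalent string comparison against the `gfxNNN` prefix when built against ROCm 6.0.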
From 27ae15a459f45f1acfcb1a9b1c8d491d9f731fd4 Mon Sep 17 00:00:00 2001
From: Steve Leung <Steve.Leung@amd.com>
Date: Thu, 4 Jan 2024 16:36:08 -0700
Subject: [PATCH] remove submodule and sync shared files from rocFFT, update
 CHANGELOG.md

---
 clients/CMakeLists.txt | 15 -
 clients/bench/CMakeLists.txt | 4 +-
 clients/bench/bench.cpp | 2 +-
 clients/hipfft_params.h | 2 +-
 clients/tests/CMakeLists.txt | 11 +-
 clients/tests/accuracy_test_1D.cpp | 8 +-
 clients/tests/accuracy_test_2D.cpp | 8 +-
 clients/tests/accuracy_test_3D.cpp | 8 +-
 clients/tests/accuracy_test_callback.cpp | 2 +-
 clients/tests/gtest_main.cpp | 6 +-
 clients/tests/hipfft_accuracy_test.cpp | 11 +-
 clients/tests/hipfft_accuracy_test.h | 2 +-
 clients/tests/multi_device_test.cpp | 2 +-
 cmake/dependencies.cmake | 3 -
 library/src/amd_detail/hipfft.cpp | 8 +-
 shared/accuracy_test.h | 1949 +++++++++++++
 shared/arithmetic.h | 61 +
 shared/array_predicate.h | 47 +
 shared/array_validator.cpp | 549 ++++
 shared/array_validator.h | 31 +
 shared/concurrency.h | 41 +
 shared/data_gen_device.h | 1303 +++++++++
 shared/data_gen_host.h | 881 ++++++
 shared/device_properties.h | 74 +
 shared/enum_to_string.h | 81 +
 shared/environment.h | 97 +
 shared/fft_params.h | 3274 ++++++++++++++++++++++
 shared/fftw_transform.h | 493 ++++
 shared/gpubuf.h | 134 +
 shared/hip_object_wrapper.h | 86 +
 shared/hostbuf.h | 158 ++
 shared/increment.h | 100 +
 shared/precision_type.h | 70 +
 shared/printbuffer.h | 108 +
 shared/ptrdiff.h | 40 +
 shared/rocfft_accuracy_test.h | 29 +
 shared/rocfft_against_fftw.h | 231 ++
 shared/rocfft_complex.h | 346 +++
 shared/rocfft_hip.h | 52 +
 shared/rocfft_params.h | 585 ++++
 shared/test_params.h | 51 +
 shared/work_queue.h | 49 +
 46 files changed, 10966 insertions(+), 66 deletions(-)
 create mode 100644 shared/accuracy_test.h
 create mode 100644 shared/arithmetic.h
 create mode 100644 shared/array_predicate.h
 create mode 100644 shared/array_validator.cpp
 create mode 100644 shared/array_validator.h
 create mode 100644 shared/concurrency.h
 create mode 100644 shared/data_gen_device.h
 create mode 100644 shared/data_gen_host.h
 create mode 100644 shared/device_properties.h
 create mode 100644 shared/enum_to_string.h
 create mode 100644 shared/environment.h
 create mode 100644 shared/fft_params.h
 create mode 100644 shared/fftw_transform.h
 create mode 100644 shared/gpubuf.h
 create mode 100644 shared/hip_object_wrapper.h
 create mode 100644 shared/hostbuf.h
 create mode 100644 shared/increment.h
 create mode 100644 shared/precision_type.h
 create mode 100644 shared/printbuffer.h
 create mode 100644 shared/ptrdiff.h
 create mode 100644 shared/rocfft_accuracy_test.h
 create mode 100644 shared/rocfft_against_fftw.h
 create mode 100644 shared/rocfft_complex.h
 create mode 100644 shared/rocfft_hip.h
 create mode 100644 shared/rocfft_params.h
 create mode 100644 shared/test_params.h
 create mode 100644 shared/work_queue.h

diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt
index 1db0d9c..b99a9e5 100644
--- a/clients/CMakeLists.txt
+++ b/clients/CMakeLists.txt
@@ -65,21 +65,6 @@ if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" AND NOT CMAKE_CXX_COMPILER_ID STR
 endif()
 
-if( GIT_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git" )
-  message(STATUS "rocFFT submodule update")
-  execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
-                  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/clients/rocFFT
-                  RESULT_VARIABLE GIT_SUBMOD_RESULT)
-  if( NOT GIT_SUBMOD_RESULT EQUAL "0" )
-    message(FATAL_ERROR "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules manually.")
-  endif( )
-endif( )
-
-if( NOT EXISTS "${CMAKE_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt" )
-  message(FATAL_ERROR "The rocFFT submodule is not present! Please update git submodules and try again. ${CMAKE_CURRENT_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt")
-endif( )
-
-
 # This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on
 # all the time
 # This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim
diff --git a/clients/bench/CMakeLists.txt b/clients/bench/CMakeLists.txt
index b5cef9b..ccb8c29 100644
--- a/clients/bench/CMakeLists.txt
+++ b/clients/bench/CMakeLists.txt
@@ -26,8 +26,8 @@ find_package( Boost COMPONENTS program_options REQUIRED)
 set( Boost_USE_STATIC_LIBS OFF )
 
-set( hipfft_bench_source bench.cpp ../rocFFT/shared/array_validator.cpp )
-set( hipfft_bench_includes bench.h ../rocFFT/shared/array_validator.h )
+set( hipfft_bench_source bench.cpp ../../shared/array_validator.cpp )
+set( hipfft_bench_includes bench.h ../../shared/array_validator.h )
 
 add_executable( hipfft-bench ${hipfft_bench_source} ${hipfft_bench_includes} )
 
diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
index 894769c..a906879 100644
--- a/clients/bench/bench.cpp
+++ b/clients/bench/bench.cpp
@@ -29,7 +29,7 @@
 #include <boost/program_options.hpp>
 namespace po = boost::program_options;
 
-#include "../rocFFT/shared/gpubuf.h"
+#include "../../shared/gpubuf.h"
 
 int main(int argc, char* argv[])
 {
diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h
index b8b58ac..75d9db9 100644
--- a/clients/hipfft_params.h
+++ b/clients/hipfft_params.h
@@ -23,9 +23,9 @@
 
 #include <optional>
 
+#include "../shared/fft_params.h"
 #include "hipfft/hipfft.h"
 #include "hipfft/hipfftXt.h"
-#include "rocFFT/shared/fft_params.h"
 
 inline fft_status fft_status_from_hipfftparams(const hipfftResult_t val)
 {
diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt
index 9742a45..2d1aac0 100644
--- a/clients/tests/CMakeLists.txt
+++ b/clients/tests/CMakeLists.txt
@@ -37,14 +37,7 @@ set( hipfft-test_source
     accuracy_test_3D.cpp
     accuracy_test_callback.cpp
     multi_device_test.cpp
-    ../rocFFT/shared/array_validator.cpp
-    )
-
-set( hipfft-test_includes
-    ../rocFFT/clients/tests/fftw_transform.h
-    ../rocFFT/clients/tests/rocfft_against_fftw.h
-    ../rocFFT/clients/tests/misc/include/test_exception.h
-    ../rocFFT/shared/array_validator.h
+    ../../shared/array_validator.cpp
     )
 
 add_executable( hipfft-test ${hipfft-test_source} ${hipfft-test_includes} )
@@ -56,8 +49,6 @@ target_include_directories(
   $<BUILD_INTERFACE:${FFTW_INCLUDE_DIRS}>
   $<BUILD_INTERFACE:${hip_INCLUDE_DIRS}>
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../library/include>
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/library/include>
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/clients/tests>
   )
 
diff --git a/clients/tests/accuracy_test_1D.cpp b/clients/tests/accuracy_test_1D.cpp
index 27e849d..57d846a 100644
--- a/clients/tests/accuracy_test_1D.cpp
+++ b/clients/tests/accuracy_test_1D.cpp
@@ -23,11 +23,11 @@
 #include <stdexcept>
 #include <vector>
 
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
 
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
 
 using ::testing::ValuesIn;
 
diff --git a/clients/tests/accuracy_test_2D.cpp b/clients/tests/accuracy_test_2D.cpp
index 1674593..6f618c0 100644
--- a/clients/tests/accuracy_test_2D.cpp
+++ b/clients/tests/accuracy_test_2D.cpp
@@ -23,11 +23,11 @@
 #include <stdexcept>
 #include <vector>
 
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
 
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
 
 using ::testing::ValuesIn;
 
diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp
index a87476a..941ec24 100644
--- a/clients/tests/accuracy_test_3D.cpp
+++ b/clients/tests/accuracy_test_3D.cpp
@@ -23,11 +23,11 @@
 #include <stdexcept>
 #include <vector>
 
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
 
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
 
 using ::testing::ValuesIn;
 
diff --git a/clients/tests/accuracy_test_callback.cpp b/clients/tests/accuracy_test_callback.cpp
index 4782830..b5cc4a7 100644
--- a/clients/tests/accuracy_test_callback.cpp
+++ b/clients/tests/accuracy_test_callback.cpp
@@ -18,7 +18,7 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "accuracy_test.h"
+#include "../../shared/accuracy_test.h"
 
 std::vector<std::vector<size_t>> callback_sizes = {
     // some single kernel sizes
diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
index 1f0ae83..2f7674e 100644
--- a/clients/tests/gtest_main.cpp
+++ b/clients/tests/gtest_main.cpp
@@ -30,10 +30,10 @@
 #include <streambuf>
 #include <string>
 
+#include "../../shared/concurrency.h"
+#include "../../shared/environment.h"
+#include "../../shared/work_queue.h"
 #include "../hipfft_params.h"
-#include "../rocFFT/shared/concurrency.h"
-#include "../rocFFT/shared/environment.h"
-#include "../rocFFT/shared/work_queue.h"
 #include "hipfft/hipfft.h"
 #include "hipfft_accuracy_test.h"
 #include "hipfft_test_params.h"
diff --git a/clients/tests/hipfft_accuracy_test.cpp b/clients/tests/hipfft_accuracy_test.cpp
index 2abaf74..609239a 100644
--- a/clients/tests/hipfft_accuracy_test.cpp
+++ b/clients/tests/hipfft_accuracy_test.cpp
@@ -29,11 +29,12 @@
 #include "hipfft/hipfft.h"
 
 #include "../hipfft_params.h"
-#include "../rocFFT/clients/tests/fftw_transform.h"
-#include "../rocFFT/clients/tests/rocfft_accuracy_test.h"
-#include "../rocFFT/clients/tests/rocfft_against_fftw.h"
-#include "../rocFFT/shared/gpubuf.h"
-#include "../rocFFT/shared/rocfft_complex.h"
+
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/gpubuf.h"
+#include "../../shared/rocfft_against_fftw.h"
+#include "../../shared/rocfft_complex.h"
 
 void fft_vs_reference(hipfft_params& params, bool round_trip)
 {
diff --git a/clients/tests/hipfft_accuracy_test.h b/clients/tests/hipfft_accuracy_test.h
index 0491bd9..181150e 100644
--- a/clients/tests/hipfft_accuracy_test.h
+++ b/clients/tests/hipfft_accuracy_test.h
@@ -23,8 +23,8 @@
 #ifndef ROCFFT_ACCURACY_TEST
 #define ROCFFT_ACCURACY_TEST
 
+#include "../../shared/accuracy_test.h"
 #include "../hipfft_params.h"
-#include "../rocFFT/clients/tests/accuracy_test.h"
 
 void fft_vs_reference(hipfft_params& params, bool round_trip = false);
 
diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp
index b3dc4c9..3274b80 100644
--- a/clients/tests/multi_device_test.cpp
+++ b/clients/tests/multi_device_test.cpp
@@ -18,7 +18,7 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "accuracy_test.h"
+#include "../../shared/accuracy_test.h"
 #include <gtest/gtest.h>
 #include <hip/hip_runtime_api.h>
 
diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index 5810e37..bdbf689 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -21,9 +21,6 @@
 #
 # #############################################################################
 
-# Git
-find_package(Git REQUIRED)
-
 # HIP
 if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
   if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
diff --git a/library/src/amd_detail/hipfft.cpp b/library/src/amd_detail/hipfft.cpp
index c2f7036..3d4f61f 100644
--- a/library/src/amd_detail/hipfft.cpp
+++ b/library/src/amd_detail/hipfft.cpp
@@ -27,10 +27,10 @@
 #include <string>
 #include <vector>
 
-#include "../../../clients/rocFFT/shared/arithmetic.h"
-#include "../../../clients/rocFFT/shared/gpubuf.h"
-#include "../../../clients/rocFFT/shared/ptrdiff.h"
-#include "../../../clients/rocFFT/shared/rocfft_hip.h"
+#include "../../../shared/arithmetic.h"
+#include "../../../shared/gpubuf.h"
+#include "../../../shared/ptrdiff.h"
+#include "../../../shared/rocfft_hip.h"
 
 #define ROC_FFT_CHECK_ALLOC_FAILED(ret) \
     {                                   \
diff --git a/shared/accuracy_test.h b/shared/accuracy_test.h
new file mode 100644
index 0000000..362a7c1
--- /dev/null
+++ b/shared/accuracy_test.h
@@ -0,0 +1,1949 @@
+// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#ifndef ACCURACY_TEST
+#define ACCURACY_TEST
+
+#include <algorithm>
+#include <functional>
+#include <future>
+#include <iterator>
+#include <string>
+#include <vector>
+
+#include "enum_to_string.h"
+#include "fft_params.h"
+#include "fftw_transform.h"
+#include "gpubuf.h"
+#include "rocfft_against_fftw.h"
+#include "test_params.h"
+
+extern int verbose;
+extern size_t ramgb;
+extern bool fftw_compare;
+
+static const size_t ONE_GiB = 1 << 30;
+
+inline size_t bytes_to_GiB(const size_t bytes)
+{
+    return bytes == 0 ? 0 : (bytes - 1 + ONE_GiB) / ONE_GiB;
+}
+
+typedef std::tuple<fft_transform_type, fft_result_placement, fft_array_type, fft_array_type>
+    type_place_io_t;
+
+// Remember the results of the last FFT we computed with FFTW. Tests
+// are ordered so that later cases can often reuse this result.
+struct last_cpu_fft_cache
+{
+    // keys to the cache
+    std::vector<size_t> length;
+    size_t nbatch = 0;
+    fft_transform_type transform_type = fft_transform_type_complex_forward;
+    bool run_callbacks = false;
+    fft_precision precision = fft_precision_single;
+
+    // FFTW input/output
+    std::vector<hostbuf> cpu_input;
+    std::vector<hostbuf> cpu_output;
+};
+extern last_cpu_fft_cache last_cpu_fft_data;
+
+struct system_memory
+{
+    size_t total_bytes = 0;
+    size_t free_bytes = 0;
+};
+extern system_memory start_memory;
+
+system_memory get_system_memory();
+
+// Estimate the amount of host memory needed for buffers.
+inline size_t needed_ram_buffers(const fft_params& params, const int verbose)
+{
+    // This calculation is assuming contiguous data but noncontiguous buffers
+    // are assumed to require a close enough amount of space for the purposes
+    // of this estimate.
+
+    size_t needed_ram = 6
+                        * std::accumulate(params.length.begin(),
+                                          params.length.end(),
+                                          static_cast<size_t>(1),
+                                          std::multiplies<size_t>());
+
+    // Account for precision and data type:
+    if(params.transform_type != fft_transform_type_real_forward
+       && params.transform_type != fft_transform_type_real_inverse)
+    {
+        needed_ram *= 2;
+    }
+    switch(params.precision)
+    {
+    case fft_precision_half:
+        needed_ram *= 2;
+        break;
+    case fft_precision_single:
+        needed_ram *= 4;
+        break;
+    case fft_precision_double:
+        needed_ram *= 8;
+        break;
+    }
+
+    needed_ram *= params.nbatch;
+
+    if(verbose)
+    {
+        std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n";
+    }
+
+    return needed_ram;
+}
+
+template <typename Tfloat>
+bool fftw_plan_uses_bluestein(const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan)
+{
+#ifdef FFTW_HAVE_SPRINT_PLAN
+    char* print_plan_c_str = fftw_sprint_plan<Tfloat>(cpu_plan);
+    std::string print_plan(print_plan_c_str);
+    free(print_plan_c_str);
+    return print_plan.find("bluestein") != std::string::npos;
+#else
+    // assume worst case (bluestein is always used)
+    return true;
+#endif
+}
+
+// Estimate the amount of host memory needed for fftw.
+template <typename Tfloat>
+inline size_t needed_ram_fftw(const fft_params& contiguous_params,
+                              const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
+                              const int verbose)
+{
+    size_t total_length = std::accumulate(contiguous_params.length.begin(),
+                                          contiguous_params.length.end(),
+                                          static_cast<size_t>(1),
+                                          std::multiplies<size_t>());
+    size_t needed_ram = 0;
+    // Detect Bluestein in plan
+    if(fftw_plan_uses_bluestein<Tfloat>(cpu_plan))
+    {
+        for(size_t dim : contiguous_params.length)
+        {
+            unsigned int needed_ram_dim = dim;
+
+            // Round dim up to the next power of two, multiplied by the other lengths
+            needed_ram_dim--;
+            needed_ram_dim |= needed_ram_dim >> 1;
+            needed_ram_dim |= needed_ram_dim >> 2;
+            needed_ram_dim |= needed_ram_dim >> 4;
+            needed_ram_dim |= needed_ram_dim >> 8;
+            needed_ram_dim |= needed_ram_dim >> 16;
+
+            needed_ram_dim++;
+
+            needed_ram_dim *= 2 * (total_length / dim);
+
+            if(needed_ram_dim > needed_ram)
+            {
+                needed_ram = needed_ram_dim;
+            }
+        }
+    }
+
+    // Account for precision and data type:
+    if(contiguous_params.transform_type != fft_transform_type_real_forward
+       && contiguous_params.transform_type != fft_transform_type_real_inverse)
+    {
+        needed_ram *= 2;
+    }
+    switch(contiguous_params.precision)
+    {
+    case fft_precision_half:
+        needed_ram *= 2;
+        break;
+    case fft_precision_single:
+        needed_ram *= 4;
+        break;
+    case fft_precision_double:
+        needed_ram *= 8;
+        break;
+    }
+
+    needed_ram *= contiguous_params.nbatch;
+
+    if(verbose)
+    {
+        std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n";
+    }
+
+    return needed_ram;
+}
+
+// Base gtest class for comparison with FFTW.
+class accuracy_test : public ::testing::TestWithParam<fft_params>
+{
+protected:
+    void SetUp() override {}
+    void TearDown() override {}
+
+public:
+    static std::string TestName(const testing::TestParamInfo<accuracy_test::ParamType>& info)
+    {
+        return info.param.token();
+    }
+};
+
+const static std::vector<size_t> batch_range = {2, 1};
+
+const static std::vector<fft_precision> precision_range_full
+    = {fft_precision_double, fft_precision_single, fft_precision_half};
+const static std::vector<fft_precision> precision_range_sp_dp
+    = {fft_precision_double, fft_precision_single};
+
+const static std::vector<fft_result_placement> place_range
+    = {fft_placement_inplace, fft_placement_notinplace};
+const static std::vector<fft_transform_type> trans_type_range
+    = {fft_transform_type_complex_forward, fft_transform_type_real_forward};
+const static std::vector<fft_transform_type> trans_type_range_complex
+    = {fft_transform_type_complex_forward};
+const static std::vector<fft_transform_type> trans_type_range_real
+    = {fft_transform_type_real_forward};
+
+// Given a vector of vector of lengths, generate all unique permutations.
+// Add an optional vector of ad-hoc lengths to the result.
+inline std::vector<std::vector<size_t>>
+    generate_lengths(const std::vector<std::vector<size_t>>& inlengths)
+{
+    std::vector<std::vector<size_t>> output;
+    if(inlengths.size() == 0)
+    {
+        return output;
+    }
+    const size_t dim = inlengths.size();
+    std::vector<size_t> looplength(dim);
+    for(unsigned int i = 0; i < dim; ++i)
+    {
+        looplength[i] = inlengths[i].size();
+    }
+    for(unsigned int idx = 0; idx < inlengths.size(); ++idx)
+    {
+        std::vector<size_t> index(dim);
+        do
+        {
+            std::vector<size_t> length(dim);
+            for(unsigned int i = 0; i < dim; ++i)
+            {
+                length[i] = inlengths[i][index[i]];
+            }
+            output.push_back(length);
+        } while(increment_rowmajor(index, looplength));
+    }
+    // uniquify the result
+    std::sort(output.begin(), output.end());
+    output.erase(std::unique(output.begin(), output.end()), output.end());
+    return output;
+}
+
+// Return the valid rocFFT input and output types for a given transform type.
+inline std::vector<std::pair<fft_array_type, fft_array_type>>
+    iotypes(const fft_transform_type transformType,
+            const fft_result_placement place,
+            const bool planar = true)
+{
+    std::vector<std::pair<fft_array_type, fft_array_type>> iotypes;
+    switch(transformType)
+    {
+    case fft_transform_type_complex_forward:
+    case fft_transform_type_complex_inverse:
+        iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+            fft_array_type_complex_interleaved, fft_array_type_complex_interleaved));
+        if(planar)
+        {
+            iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+                fft_array_type_complex_planar, fft_array_type_complex_planar));
+            if(place == fft_placement_notinplace)
+            {
+                iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+                    fft_array_type_complex_planar, fft_array_type_complex_interleaved));
+                iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+                    fft_array_type_complex_interleaved, fft_array_type_complex_planar));
+            }
+        }
+        break;
+    case fft_transform_type_real_forward:
+        iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+            fft_array_type_real, fft_array_type_hermitian_interleaved));
+        if(planar && place == fft_placement_notinplace)
+        {
+            iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+                fft_array_type_real, fft_array_type_hermitian_planar));
+        }
+        break;
+    case fft_transform_type_real_inverse:
+        iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+            fft_array_type_hermitian_interleaved, fft_array_type_real));
+        if(planar && place == fft_placement_notinplace)
+        {
+            iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>(
+                fft_array_type_hermitian_planar, fft_array_type_real));
+        }
+        break;
+    default:
+        throw std::runtime_error("Invalid transform type");
+    }
+    return iotypes;
+}
+
+// Generate all combinations of input/output types, from combinations of transform and placement
+// types.
+static std::vector<type_place_io_t>
+    generate_types(fft_transform_type transform_type,
+                   const std::vector<fft_result_placement>& place_range,
+                   const bool planar)
+{
+    std::vector<type_place_io_t> ret;
+    for(auto place : place_range)
+    {
+        for(auto iotype : iotypes(transform_type, place, planar))
+        {
+            ret.push_back(std::make_tuple(transform_type, place, iotype.first, iotype.second));
+        }
+    }
+    return ret;
+}
+
+struct stride_generator
+{
+    struct stride_dist
+    {
+        stride_dist(const std::vector<size_t>& s, size_t d)
+            : stride(s)
+            , dist(d)
+        {
+        }
+        std::vector<size_t> stride;
+        size_t dist;
+    };
+
+    // NOTE: allow for this ctor to be implicit, so it's less typing for a test writer
+    //
+    // cppcheck-suppress noExplicitConstructor
+    stride_generator(const std::vector<std::vector<size_t>>& stride_list_in)
+        : stride_list(stride_list_in)
+    {
+    }
+    virtual std::vector<stride_dist> generate(const std::vector<size_t>& lengths,
+                                              size_t batch) const
+    {
+        std::vector<stride_dist> ret;
+        for(const auto& s : stride_list)
+            ret.emplace_back(s, 0);
+        return ret;
+    }
+    std::vector<std::vector<size_t>> stride_list;
+};
+
+// Generate strides such that batch is essentially the innermost dimension
+// e.g. given a batch-2 4x3x2 transform which logically looks like:
+//
+// batch0:
+// A B A B
+// A B A B
+// A B A B
+//
+// A B A B
+// A B A B
+// A B A B
+//
+// batch1:
+// A B A B
+// A B A B
+// A B A B
+//
+// A B A B
+// A B A B
+// A B A B
+//
+// we instead do stride-2 4x3x2 transform where first batch is the
+// A's and second batch is the B's.
+struct stride_generator_3D_inner_batch : public stride_generator
+{
+    explicit stride_generator_3D_inner_batch(const std::vector<std::vector<size_t>>& stride_list_in)
+        : stride_generator(stride_list_in)
+    {
+    }
+    std::vector<stride_dist> generate(const std::vector<size_t>& lengths,
+                                      size_t batch) const override
+    {
+        std::vector<stride_dist> ret = stride_generator::generate(lengths, batch);
+        std::vector<size_t> strides{lengths[1] * lengths[2] * batch, lengths[2] * batch, batch};
+        ret.emplace_back(strides, 1);
+        return ret;
+    }
+};
+
+// Create an array of parameters to pass to gtest.  Base generator
+// that allows choosing transform type.
+inline auto param_generator_base(const std::vector<fft_transform_type>& type_range,
+                                 const std::vector<std::vector<size_t>>& v_lengths,
+                                 const std::vector<fft_precision>& precision_range,
+                                 const std::vector<size_t>& batch_range,
+                                 decltype(generate_types) types_generator,
+                                 const stride_generator& istride,
+                                 const stride_generator& ostride,
+                                 const std::vector<std::vector<size_t>>& ioffset_range,
+                                 const std::vector<std::vector<size_t>>& ooffset_range,
+                                 const std::vector<fft_result_placement>& place_range,
+                                 const bool planar = true,
+                                 const bool run_callbacks = false)
+{
+
+    std::vector<fft_params> params;
+
+    // For any length, we compute double-precision CPU reference
+    // for largest batch size first and reuse for smaller batch
+    // sizes, then convert to single-precision.
+
+    for(auto& transform_type : type_range)
+    {
+        for(const auto& lengths : v_lengths)
+        {
+            // try to ensure that we are given literal lengths, not
+            // something to be passed to generate_lengths
+            if(lengths.empty() || lengths.size() > 3)
+            {
+                continue;
+            }
+            {
+                for(const auto precision : precision_range)
+                {
+                    for(const auto batch : batch_range)
+                    {
+                        for(const auto& types :
+                            types_generator(transform_type, place_range, planar))
+                        {
+                            for(const auto& istride_dist : istride.generate(lengths, batch))
+                            {
+                                for(const auto& ostride_dist : ostride.generate(lengths, batch))
+                                {
+                                    for(const auto& ioffset : ioffset_range)
+                                    {
+                                        for(const auto& ooffset : ooffset_range)
+                                        {
+                                            fft_params param;
+
+                                            param.length = lengths;
+                                            param.istride = istride_dist.stride;
+                                            param.ostride = ostride_dist.stride;
+                                            param.nbatch = batch;
+                                            param.precision = precision;
+                                            param.transform_type = std::get<0>(types);
+                                            param.placement = std::get<1>(types);
+                                            param.idist = istride_dist.dist;
+                                            param.odist = ostride_dist.dist;
+                                            param.itype = std::get<2>(types);
+                                            param.otype = std::get<3>(types);
+                                            param.ioffset = ioffset;
+                                            param.ooffset = ooffset;
+
+                                            if(run_callbacks)
+                                            {
+                                                // add a test if both input and output support callbacks
+                                                if(param.itype != fft_array_type_complex_planar
+                                                   && param.itype != fft_array_type_hermitian_planar
+                                                   && param.otype != fft_array_type_complex_planar
+                                                   && param.otype
+                                                          != fft_array_type_hermitian_planar)
+                                                {
+                                                    param.run_callbacks = true;
+                                                }
+                                                else
+                                                {
+                                                    continue;
+                                                }
+                                            }
+                                            param.validate();
+
+                                            // Keeping the random number generator here
+                                            // allows one to run the same tests for a given
+                                            // random seed; ie the test suite is repeatable.
+                                            std::hash<std::string> hasher;
+                                            std::ranlux24_base gen(random_seed
+                                                                   + hasher(param.token()));
+                                            std::uniform_real_distribution<> dis(0.0, 1.0);
+
+                                            if(param.is_planar())
+                                            {
+                                                const double roll = dis(gen);
+                                                if(roll > planar_prob)
+                                                {
+                                                    if(verbose > 4)
+                                                    {
+                                                        std::cout << "Planar transform skipped "
+                                                                     "(planar_prob: "
+                                                                  << planar_prob << " > " << roll
+                                                                  << ")\n";
+                                                    }
+                                                    continue;
+                                                }
+                                            }
+                                            if(run_callbacks)
+                                            {
+                                                const double roll = dis(gen);
+                                                if(roll > callback_prob)
+                                                {
+
+                                                    if(verbose > 4)
+                                                    {
+                                                        std::cout << "Callback transform skipped "
+                                                                     "(planar_prob: "
+                                                                  << planar_prob << " > " << roll
+                                                                  << ")\n";
+                                                    }
+                                                    continue;
+                                                }
+                                            }
+
+                                            if(param.valid(0))
+                                            {
+                                                params.push_back(param);
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return params;
+}
+
+// Create an array of parameters to pass to gtest.  Default generator
+// that picks all transform types.
+inline auto param_generator(const std::vector<std::vector<size_t>>& v_lengths,
+                            const std::vector<fft_precision>& precision_range,
+                            const std::vector<size_t>& batch_range,
+                            const stride_generator& istride,
+                            const stride_generator& ostride,
+                            const std::vector<std::vector<size_t>>& ioffset_range,
+                            const std::vector<std::vector<size_t>>& ooffset_range,
+                            const std::vector<fft_result_placement>& place_range,
+                            const bool planar,
+                            const bool run_callbacks = false)
+{
+    return param_generator_base(trans_type_range,
+                                v_lengths,
+                                precision_range,
+                                batch_range,
+                                generate_types,
+                                istride,
+                                ostride,
+                                ioffset_range,
+                                ooffset_range,
+                                place_range,
+                                planar,
+                                run_callbacks);
+}
+
+// Create an array of parameters to pass to gtest.  Only tests complex-type transforms
+inline auto param_generator_complex(const std::vector<std::vector<size_t>>& v_lengths,
+                                    const std::vector<fft_precision>& precision_range,
+                                    const std::vector<size_t>& batch_range,
+                                    const stride_generator& istride,
+                                    const stride_generator& ostride,
+                                    const std::vector<std::vector<size_t>>& ioffset_range,
+                                    const std::vector<std::vector<size_t>>& ooffset_range,
+                                    const std::vector<fft_result_placement>& place_range,
+                                    const bool planar,
+                                    const bool run_callbacks = false)
+{
+    return param_generator_base(trans_type_range_complex,
+                                v_lengths,
+                                precision_range,
+                                batch_range,
+                                generate_types,
+                                istride,
+                                ostride,
+                                ioffset_range,
+                                ooffset_range,
+                                place_range,
+                                planar,
+                                run_callbacks);
+}
+
+// Create an array of parameters to pass to gtest.
+inline auto param_generator_real(const std::vector<std::vector<size_t>>& v_lengths,
+                                 const std::vector<fft_precision>& precision_range,
+                                 const std::vector<size_t>& batch_range,
+                                 const stride_generator& istride,
+                                 const stride_generator& ostride,
+                                 const std::vector<std::vector<size_t>>& ioffset_range,
+                                 const std::vector<std::vector<size_t>>& ooffset_range,
+                                 const std::vector<fft_result_placement>& place_range,
+                                 const bool planar,
+                                 const bool run_callbacks = false)
+{
+    return param_generator_base(trans_type_range_real,
+                                v_lengths,
+                                precision_range,
+                                batch_range,
+                                generate_types,
+                                istride,
+                                ostride,
+                                ioffset_range,
+                                ooffset_range,
+                                place_range,
+                                planar,
+                                run_callbacks);
+}
+
+template <class Tcontainer>
+auto param_generator_token(const Tcontainer& tokens)
+{
+    std::vector<fft_params> params;
+    params.reserve(tokens.size());
+    for(auto t : tokens)
+    {
+        params.push_back({});
+        params.back().from_token(t);
+    }
+    return params;
+}
+
+struct callback_test_data
+{
+    // scalar to modify the input/output with
+    double scalar;
+    // base address of input, to ensure that each callback gets an offset from that base
+    void* base;
+};
+
+void* get_load_callback_host(fft_array_type itype,
+                             fft_precision precision,
+                             bool round_trip_inverse);
+void apply_load_callback(const fft_params& params, std::vector<hostbuf>& input);
+void apply_store_callback(const fft_params& params, std::vector<hostbuf>& output);
+void* get_store_callback_host(fft_array_type otype,
+                              fft_precision precision,
+                              bool round_trip_inverse);
+
+static auto allocate_cpu_fft_buffer(const fft_precision precision,
+                                    const fft_array_type type,
+                                    const std::vector<size_t>& size)
+{
+    // FFTW does not support half-precision, so we do single instead.
+    // So if we need to do a half-precision FFTW transform, allocate
+    // enough buffer for single-precision instead.
+    return allocate_host_buffer(
+        precision == fft_precision_half ? fft_precision_single : precision, type, size);
+}
+
+template <typename Tfloat>
+inline void execute_cpu_fft(fft_params& params,
+                            fft_params& contiguous_params,
+                            typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
+                            std::vector<hostbuf>& cpu_input,
+                            std::vector<hostbuf>& cpu_output)
+{
+    // CPU output might not be allocated already for us, if FFTW never
+    // needed an output buffer during planning
+    if(cpu_output.empty())
+        cpu_output = allocate_cpu_fft_buffer(
+            contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
+
+    // If this is either C2R or callbacks are enabled, the
+    // input will be modified.  So we need to modify the copy instead.
+    std::vector<hostbuf> cpu_input_copy(cpu_input.size());
+    std::vector<hostbuf>* input_ptr = &cpu_input;
+    if(params.run_callbacks || contiguous_params.transform_type == fft_transform_type_real_inverse)
+    {
+        for(size_t i = 0; i < cpu_input.size(); ++i)
+        {
+            cpu_input_copy[i] = cpu_input[i].copy();
+        }
+
+        input_ptr = &cpu_input_copy;
+    }
+
+    // run FFTW (which may destroy CPU input)
+    apply_load_callback(params, *input_ptr);
+    fftw_run<Tfloat>(contiguous_params.transform_type, cpu_plan, *input_ptr, cpu_output);
+    // clean up
+    fftw_destroy_plan_type(cpu_plan);
+    // ask FFTW to fully clean up, since it tries to cache plan details
+    fftw_cleanup();
+    cpu_plan = nullptr;
+    apply_store_callback(params, cpu_output);
+}
+
+// execute the GPU transform
+template <class Tparams>
+inline void execute_gpu_fft(Tparams& params,
+                            std::vector<void*>& pibuffer,
+                            std::vector<void*>& pobuffer,
+                            std::vector<gpubuf>& obuffer,
+                            std::vector<hostbuf>& gpu_output,
+                            bool round_trip_inverse = false)
+{
+    gpubuf_t<callback_test_data> load_cb_data_dev;
+    gpubuf_t<callback_test_data> store_cb_data_dev;
+    if(params.run_callbacks)
+    {
+        void* load_cb_host
+            = get_load_callback_host(params.itype, params.precision, round_trip_inverse);
+
+        callback_test_data load_cb_data_host;
+
+        if(round_trip_inverse)
+        {
+            load_cb_data_host.scalar = params.store_cb_scalar;
+        }
+        else
+        {
+            load_cb_data_host.scalar = params.load_cb_scalar;
+        }
+
+        load_cb_data_host.base = pibuffer.front();
+
+        auto hip_status = hipSuccess;
+
+        hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data));
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP();
+            }
+            else
+            {
+                GTEST_FAIL();
+            }
+        }
+        hip_status = hipMemcpy(load_cb_data_dev.data(),
+                               &load_cb_data_host,
+                               sizeof(callback_test_data),
+                               hipMemcpyHostToDevice);
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP();
+            }
+            else
+            {
+                GTEST_FAIL();
+            }
+        }
+
+        void* store_cb_host
+            = get_store_callback_host(params.otype, params.precision, round_trip_inverse);
+
+        callback_test_data store_cb_data_host;
+
+        if(round_trip_inverse)
+        {
+            store_cb_data_host.scalar = params.load_cb_scalar;
+        }
+        else
+        {
+            store_cb_data_host.scalar = params.store_cb_scalar;
+        }
+
+        store_cb_data_host.base = pobuffer.front();
+
+        hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data));
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP();
+            }
+            else
+            {
+                GTEST_FAIL();
+            }
+        }
+
+        hip_status = hipMemcpy(store_cb_data_dev.data(),
+                               &store_cb_data_host,
+                               sizeof(callback_test_data),
+                               hipMemcpyHostToDevice);
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP();
+            }
+            else
+            {
+                GTEST_FAIL();
+            }
+        }
+
+        auto fft_status = params.set_callbacks(
+            load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data());
+        if(fft_status != fft_status_success)
+            throw std::runtime_error("set callback failure");
+    }
+
+    // Execute the transform:
+    auto fft_status = params.execute(pibuffer.data(), pobuffer.data());
+    if(fft_status != fft_status_success)
+        throw std::runtime_error("rocFFT plan execution failure");
+
+    // if not comparing, then just executing the GPU FFT is all we
+    // need to do
+    if(!fftw_compare)
+        return;
+
+    // finalize a multi-GPU transform
+    params.multi_gpu_finalize(obuffer, pobuffer);
+
+    ASSERT_TRUE(!gpu_output.empty()) << "no output buffers";
+    for(unsigned int idx = 0; idx < gpu_output.size(); ++idx)
+    {
+        ASSERT_TRUE(gpu_output[idx].data() != nullptr)
+            << "output buffer index " << idx << " is empty";
+        auto hip_status = hipMemcpy(gpu_output[idx].data(),
+                                    pobuffer.at(idx),
+                                    gpu_output[idx].size(),
+                                    hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP() << "hipMemcpy failure";
+            }
+            else
+            {
+                GTEST_FAIL() << "hipMemcpy failure";
+            }
+        }
+    }
+    if(verbose > 2)
+    {
+        std::cout << "GPU output:\n";
+        params.print_obuffer(gpu_output);
+    }
+    if(verbose > 5)
+    {
+        std::cout << "flat GPU output:\n";
+        params.print_obuffer_flat(gpu_output);
+    }
+}
+
+template <typename Tfloat>
+static void assert_init_value(const std::vector<hostbuf>& output,
+                              const size_t idx,
+                              const Tfloat orig_value);
+
+template <>
+void assert_init_value(const std::vector<hostbuf>& output, const size_t idx, const float orig_value)
+{
+    float actual_value = reinterpret_cast<const float*>(output.front().data())[idx];
+    ASSERT_EQ(actual_value, orig_value) << "index " << idx;
+}
+
+template <>
+void assert_init_value(const std::vector<hostbuf>& output,
+                       const size_t idx,
+                       const double orig_value)
+{
+    double actual_value = reinterpret_cast<const double*>(output.front().data())[idx];
+    ASSERT_EQ(actual_value, orig_value) << "index " << idx;
+}
+
+template <>
+void assert_init_value(const std::vector<hostbuf>& output,
+                       const size_t idx,
+                       const rocfft_complex<float> orig_value)
+{
+    // if this is interleaved, check directly
+    if(output.size() == 1)
+    {
+        rocfft_complex<float> actual_value
+            = reinterpret_cast<const rocfft_complex<float>*>(output.front().data())[idx];
+        ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
+        ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
+    }
+    else
+    {
+        // planar
+        rocfft_complex<float> actual_value{
+            reinterpret_cast<const float*>(output.front().data())[idx],
+            reinterpret_cast<const float*>(output.back().data())[idx]};
+        ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
+        ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
+    }
+}
+
+template <>
+void assert_init_value(const std::vector<hostbuf>& output,
+                       const size_t idx,
+                       const rocfft_complex<double> orig_value)
+{
+    // if this is interleaved, check directly
+    if(output.size() == 1)
+    {
+        rocfft_complex<double> actual_value
+            = reinterpret_cast<const rocfft_complex<double>*>(output.front().data())[idx];
+        ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
+        ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
+    }
+    else
+    {
+        // planar
+        rocfft_complex<double> actual_value{
+            reinterpret_cast<const double*>(output.front().data())[idx],
+            reinterpret_cast<const double*>(output.back().data())[idx]};
+        ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
+        ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
+    }
+}
+
+static const int OUTPUT_INIT_PATTERN = 0xcd;
+template <class Tfloat>
+void check_single_output_stride(const std::vector<hostbuf>& output,
+                                const size_t offset,
+                                const std::vector<size_t>& length,
+                                const std::vector<size_t>& stride,
+                                const size_t i)
+{
+    Tfloat orig;
+    memset(static_cast<void*>(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat));
+
+    size_t curLength = length[i];
+    size_t curStride = stride[i];
+    size_t nextSmallerLength = i == length.size() - 1 ? 0 : length[i + 1];
+    size_t nextSmallerStride = i == stride.size() - 1 ? 0 : stride[i + 1];
+
+    if(nextSmallerLength == 0)
+    {
+        // this is the fastest dim, indexes that are not multiples of
+        // the stride should be the initial value
+        for(size_t idx = 0; idx < (curLength - 1) * curStride; ++idx)
+        {
+            if(idx % curStride != 0)
+                assert_init_value<Tfloat>(output, idx, orig);
+        }
+    }
+    else
+    {
+        for(size_t lengthIdx = 0; lengthIdx < curLength; ++lengthIdx)
+        {
+            // check that the space after the next smaller dim and the
+            // end of this dim is initial value
+            for(size_t idx = nextSmallerLength * nextSmallerStride; idx < curStride; ++idx)
+                assert_init_value<Tfloat>(output, idx, orig);
+
+            check_single_output_stride<Tfloat>(
+                output, offset + lengthIdx * curStride, length, stride, i + 1);
+        }
+    }
+}
+
+template <class Tparams>
+void check_output_strides(const std::vector<hostbuf>& output, Tparams& params)
+{
+    // treat batch+dist like highest length+stride, if batch > 1
+    std::vector<size_t> length;
+    std::vector<size_t> stride;
+    if(params.nbatch > 1)
+    {
+        length.push_back(params.nbatch);
+        stride.push_back(params.odist);
+    }
+
+    auto olength = params.olength();
+    std::copy(olength.begin(), olength.end(), std::back_inserter(length));
+    std::copy(params.ostride.begin(), params.ostride.end(), std::back_inserter(stride));
+
+    if(params.precision == fft_precision_single)
+    {
+        if(params.otype == fft_array_type_real)
+            check_single_output_stride<float>(output, 0, length, stride, 0);
+        else
+            check_single_output_stride<rocfft_complex<float>>(output, 0, length, stride, 0);
+    }
+    else
+    {
+        if(params.otype == fft_array_type_real)
+            check_single_output_stride<double>(output, 0, length, stride, 0);
+        else
+            check_single_output_stride<rocfft_complex<double>>(output, 0, length, stride, 0);
+    }
+}
+
+// run rocFFT inverse transform
+template <class Tparams>
+inline void run_round_trip_inverse(Tparams& params,
+                                   std::vector<gpubuf>& obuffer,
+                                   std::vector<void*>& pibuffer,
+                                   std::vector<void*>& pobuffer,
+                                   std::vector<hostbuf>& gpu_output)
+{
+    params.validate();
+
+    // Make sure that the parameters make sense:
+    ASSERT_TRUE(params.valid(verbose));
+
+    // Create FFT plan - this will also allocate work buffer, but will throw a
+    // specific exception if that step fails
+    auto plan_status = fft_status_success;
+    try
+    {
+        plan_status = params.create_plan();
+    }
+    catch(fft_params::work_buffer_alloc_failure& e)
+    {
+        std::stringstream ss;
+        ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")";
+        ++n_hip_failures;
+        if(skip_runtime_fails)
+        {
+            GTEST_SKIP() << ss.str();
+        }
+        else
+        {
+            GTEST_FAIL() << ss.str();
+        }
+    }
+    ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed";
+
+    auto obuffer_sizes = params.obuffer_sizes();
+
+    if(params.placement != fft_placement_inplace)
+    {
+        for(unsigned int i = 0; i < obuffer_sizes.size(); ++i)
+        {
+            // If we're validating output strides, init the
+            // output buffer to a known pattern and we can check
+            // that the pattern is untouched in places that
+            // shouldn't have been touched.
+            if(params.check_output_strides)
+            {
+                auto hip_status
+                    = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
+                if(hip_status != hipSuccess)
+                {
+                    ++n_hip_failures;
+                    if(skip_runtime_fails)
+                    {
+                        GTEST_SKIP() << "hipMemset failure";
+                    }
+                    else
+                    {
+                        GTEST_FAIL() << "hipMemset failure";
+                    }
+                }
+            }
+        }
+    }
+
+    // execute GPU transform
+    execute_gpu_fft(params, pibuffer, pobuffer, obuffer, gpu_output, true);
+}
+
+// compare rocFFT inverse transform with forward transform input
+template <class Tparams>
+inline void compare_round_trip_inverse(Tparams& params,
+                                       fft_params& contiguous_params,
+                                       std::vector<hostbuf>& gpu_output,
+                                       std::vector<hostbuf>& cpu_input,
+                                       const VectorNorms& cpu_input_norm,
+                                       size_t total_length)
+{
+    if(params.check_output_strides)
+    {
+        check_output_strides<Tparams>(gpu_output, params);
+    }
+
+    // compute GPU output norm
+    std::shared_future<VectorNorms> gpu_norm = std::async(std::launch::async, [&]() {
+        return norm(gpu_output,
+                    params.olength(),
+                    params.nbatch,
+                    params.precision,
+                    params.otype,
+                    params.ostride,
+                    params.odist,
+                    params.ooffset);
+    });
+
+    // compare GPU inverse output to CPU forward input
+    std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
+    if(verbose > 1)
+        linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
+    const double linf_cutoff
+        = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length);
+
+    VectorNorms diff = distance(cpu_input,
+                                gpu_output,
+                                params.olength(),
+                                params.nbatch,
+                                params.precision,
+                                contiguous_params.itype,
+                                contiguous_params.istride,
+                                contiguous_params.idist,
+                                params.otype,
+                                params.ostride,
+                                params.odist,
+                                linf_failures.get(),
+                                linf_cutoff,
+                                {0},
+                                params.ooffset,
+                                1.0 / total_length);
+
+    if(verbose > 1)
+    {
+        std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n";
+        std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n";
+        std::cout << "GPU linf norm failures:";
+        std::sort(linf_failures->begin(), linf_failures->end());
+        for(const auto& i : *linf_failures)
+        {
+            std::cout << " (" << i.first << "," << i.second << ")";
+        }
+        std::cout << std::endl;
+    }
+
+    EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str();
+    EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str();
+
+    switch(params.precision)
+    {
+    case fft_precision_half:
+        max_linf_eps_half
+            = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
+        max_l2_eps_half
+            = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
+        break;
+    case fft_precision_single:
+        max_linf_eps_single
+            = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
+        max_l2_eps_single
+            = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
+        break;
+    case fft_precision_double:
+        max_linf_eps_double
+            = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
+        max_l2_eps_double
+            = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
+        break;
+    }
+
+    if(verbose > 1)
+    {
+        std::cout << "L2 diff: " << diff.l_2 << "\n";
+        std::cout << "Linf diff: " << diff.l_inf << "\n";
+    }
+
+    EXPECT_TRUE(diff.l_inf <= linf_cutoff)
+        << "Linf test failed.  Linf:" << diff.l_inf
+        << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff
+        << params.str();
+
+    EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2
+                < sqrt(log2(total_length)) * type_epsilon(params.precision))
+        << "L2 test failed. L2: " << diff.l_2
+        << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2
+        << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
+        << params.str();
+}
+
+// RAII type to put data into the cache when this object leaves scope
+struct StoreCPUDataToCache
+{
+    StoreCPUDataToCache(std::vector<hostbuf>& cpu_input, std::vector<hostbuf>& cpu_output)
+        : cpu_input(cpu_input)
+        , cpu_output(cpu_output)
+    {
+    }
+    ~StoreCPUDataToCache()
+    {
+        last_cpu_fft_data.cpu_output.swap(cpu_output);
+        last_cpu_fft_data.cpu_input.swap(cpu_input);
+    }
+    std::vector<hostbuf>& cpu_input;
+    std::vector<hostbuf>& cpu_output;
+};
+
+// run CPU + rocFFT transform with the given params and compare
+template <class Tfloat, class Tparams>
+inline void fft_vs_reference_impl(Tparams& params, bool round_trip)
+{
+    // Call hipGetLastError to reset any errors
+    // returned by previous HIP runtime API calls.
+    hipError_t hip_status = hipGetLastError();
+
+    // Make sure that the parameters make sense:
+    ASSERT_TRUE(params.valid(verbose));
+
+    size_t needed_ram = needed_ram_buffers(params, verbose);
+
+    if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
+    {
+        GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb
+                     << ".\n";
+    }
+
+    auto ibuffer_sizes = params.ibuffer_sizes();
+    auto obuffer_sizes = params.obuffer_sizes();
+
+    size_t vram_avail = 0;
+
+    if(vramgb == 0)
+    {
+        // Check free and total available memory:
+        size_t free = 0;
+        size_t total = 0;
+        auto hip_status = hipMemGetInfo(&free, &total);
+        if(hip_status != hipSuccess || total == 0)
+        {
+            ++n_hip_failures;
+            std::stringstream ss;
+            if(total == 0)
+                ss << "hipMemGetInfo claims there isn't any vram";
+            else
+                ss << "hipMemGetInfo failure with error " << hip_status;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP() << ss.str();
+            }
+            else
+            {
+                GTEST_FAIL() << ss.str();
+            }
+        }
+        vram_avail = total;
+    }
+    else
+    {
+        vram_avail = vramgb * ONE_GiB;
+    }
+
+    // First try a quick estimation of vram footprint, to speed up skipping tests
+    // that are too large to fit in the gpu (no plan created with the rocFFT backend)
+    const auto raw_vram_footprint
+        = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
+
+    if(!vram_fits_problem(raw_vram_footprint, vram_avail))
+    {
+        GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint)
+                     << " GiB) raw data too large for device";
+    }
+
+    if(verbose > 2)
+    {
+        std::cout << "Raw problem size: " << raw_vram_footprint << std::endl;
+    }
+
+    // If it passed the quick estimation test, go for the more
+    // accurate calculation that actually creates the plan and
+    // take into account the work buffer size
+    const auto vram_footprint = params.vram_footprint();
+    if(!vram_fits_problem(vram_footprint, vram_avail))
+    {
+        if(verbose)
+        {
+            std::cout << "Problem raw data won't fit on device; skipped." << std::endl;
+        }
+        GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint)
+                     << " GiB) raw data too large for device";
+    }
+
+    // Create FFT plan - this will also allocate work buffer, but
+    // will throw a specific exception if that step fails
+    auto plan_status = fft_status_success;
+    try
+    {
+        plan_status = params.create_plan();
+    }
+    catch(fft_params::work_buffer_alloc_failure& e)
+    {
+        ++n_hip_failures;
+        std::stringstream ss;
+        ss << "Work buffer allocation failed with size: " << params.workbuffersize;
+        if(skip_runtime_fails)
+        {
+            GTEST_SKIP() << ss.str();
+        }
+        else
+        {
+            GTEST_FAIL() << ss.str();
+        }
+    }
+    ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed";
+
+    if(!vram_fits_problem(vram_footprint, vram_avail))
+    {
+        if(verbose)
+        {
+            std::cout << "Problem won't fit on device; skipped." << std::endl;
+        }
+        GTEST_SKIP() << "Problem size (" << vram_footprint << ") too large for device";
+        return;
+    }
+
+    fft_params contiguous_params;
+    contiguous_params.length = params.length;
+    contiguous_params.precision = params.precision;
+    contiguous_params.placement = fft_placement_notinplace;
+    contiguous_params.transform_type = params.transform_type;
+    contiguous_params.nbatch = params.nbatch;
+    contiguous_params.itype = contiguous_itype(params.transform_type);
+    contiguous_params.otype = contiguous_otype(contiguous_params.transform_type);
+
+    contiguous_params.validate();
+
+    if(!contiguous_params.valid(verbose))
+    {
+        throw std::runtime_error("Invalid contiguous params");
+    }
+
+    if(verbose > 3)
+    {
+        std::cout << "CPU params:\n";
+        std::cout << contiguous_params.str("\n\t") << std::endl;
+    }
+
+    std::vector<gpubuf> ibuffer(ibuffer_sizes.size());
+    std::vector<void*> pibuffer(ibuffer_sizes.size());
+    for(unsigned int i = 0; i < ibuffer.size(); ++i)
+    {
+        hip_status = ibuffer[i].alloc(ibuffer_sizes[i]);
+        if(hip_status != hipSuccess)
+        {
+            std::stringstream ss;
+            ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "("
+               << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)"
+               << " with code " << hipError_to_string(hip_status);
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP() << ss.str();
+            }
+            else
+            {
+                GTEST_FAIL() << ss.str();
+            }
+        }
+        pibuffer[i] = ibuffer[i].data();
+    }
+
+    // allocation counts in elements, ibuffer_sizes is in bytes
+    auto ibuffer_sizes_elems = ibuffer_sizes;
+    for(auto& buf : ibuffer_sizes_elems)
+        buf /= var_size<size_t>(params.precision, params.itype);
+
+    // Check cache first - nbatch is a >= comparison because we compute
+    // the largest batch size and cache it.  Smaller batch runs can
+    // compare against the larger data.
+    std::vector<hostbuf> cpu_input;
+    std::vector<hostbuf> cpu_output;
+    std::shared_future<void> convert_cpu_output_precision;
+    std::shared_future<void> convert_cpu_input_precision;
+    bool run_fftw = true;
+    std::unique_ptr<StoreCPUDataToCache> store_to_cache;
+    if(fftw_compare && last_cpu_fft_data.length == params.length
+       && last_cpu_fft_data.transform_type == params.transform_type
+       && last_cpu_fft_data.run_callbacks == params.run_callbacks)
+    {
+        if(last_cpu_fft_data.nbatch >= params.nbatch)
+        {
+            // use the cached input/output
+            cpu_input.swap(last_cpu_fft_data.cpu_input);
+            cpu_output.swap(last_cpu_fft_data.cpu_output);
+            run_fftw = false;
+
+            store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
+
+            if(params.precision != last_cpu_fft_data.precision)
+            {
+                // Tests should be ordered so we do wider first, then narrower.
+                switch(params.precision)
+                {
+                case fft_precision_double:
+                    std::cerr
+                        << "test ordering is incorrect: double precision follows a narrower one"
+                        << std::endl;
+                    abort();
+                    break;
+                case fft_precision_single:
+                    if(last_cpu_fft_data.precision != fft_precision_double)
+                    {
+                        std::cerr
+                            << "test ordering is incorrect: float precision follows a narrower one"
+                            << std::endl;
+                        abort();
+                    }
+                    // convert the input/output to single-precision
+                    convert_cpu_output_precision = std::async(std::launch::async, [&]() {
+                        narrow_precision_inplace<double, float>(cpu_output.front());
|
|
+ });
|
|
+ convert_cpu_input_precision = std::async(std::launch::async, [&]() {
|
|
+ narrow_precision_inplace<double, float>(cpu_input.front());
|
|
+ });
|
|
+ break;
|
|
+ case fft_precision_half:
|
|
+ // convert to half precision
|
|
+ if(last_cpu_fft_data.precision == fft_precision_double)
|
|
+ {
|
|
+ convert_cpu_output_precision = std::async(std::launch::async, [&]() {
|
|
+ narrow_precision_inplace<double, _Float16>(cpu_output.front());
|
|
+ });
|
|
+ convert_cpu_input_precision = std::async(std::launch::async, [&]() {
|
|
+ narrow_precision_inplace<double, _Float16>(cpu_input.front());
|
|
+ });
|
|
+ }
|
|
+ else if(last_cpu_fft_data.precision == fft_precision_single)
|
|
+ {
|
|
+ convert_cpu_output_precision = std::async(std::launch::async, [&]() {
|
|
+ narrow_precision_inplace<float, _Float16>(cpu_output.front());
|
|
+ });
|
|
+ convert_cpu_input_precision = std::async(std::launch::async, [&]() {
|
|
+ narrow_precision_inplace<float, _Float16>(cpu_input.front());
|
|
+ });
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ std::cerr << "unhandled previous precision, cannot convert to half"
|
|
+ << std::endl;
|
|
+ abort();
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ last_cpu_fft_data.precision = params.precision;
|
|
+ }
|
|
+ }
|
|
+ // If the last result has a smaller batch than the new
|
|
+ // params, that might be a developer error - tests should be
|
|
+ // ordered to generate the bigger batch first. But if tests
|
|
+ // got filtered or skipped due to insufficient memory, we
|
|
+ // might never have tried to generate the bigger batch first.
|
|
+ // So just fall through and redo the CPU FFT.
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ // Clear cache explicitly so that even if we didn't get a hit,
|
|
+ // we're not uselessly holding on to cached cpu input/output
|
|
+ last_cpu_fft_data = last_cpu_fft_cache();
|
|
+ }
|
|
+
|
|
+ // Allocate CPU input
|
|
+ if(run_fftw)
|
|
+ {
|
|
+ cpu_input = allocate_cpu_fft_buffer(
|
|
+ contiguous_params.precision, contiguous_params.itype, contiguous_params.isize);
|
|
+ }
|
|
+
|
|
+ // Create FFTW plan - this may write to input, but that's fine
|
|
+ // since there's nothing in there right now
|
|
+ typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan = nullptr;
|
|
+ if(run_fftw)
|
|
+ {
|
|
+ // Normally, we would want to defer allocation of CPU output
|
|
+ // buffer until when we actually do the CPU FFT. But if we're
|
|
+ // using FFTW wisdom, FFTW needs an output buffer at plan
|
|
+ // creation time.
|
|
+ if(use_fftw_wisdom)
|
|
+ {
|
|
+ cpu_output = allocate_cpu_fft_buffer(
|
|
+ contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
|
|
+ }
|
|
+ cpu_plan = fftw_plan_via_rocfft<Tfloat>(contiguous_params.length,
|
|
+ contiguous_params.istride,
|
|
+ contiguous_params.ostride,
|
|
+ contiguous_params.nbatch,
|
|
+ contiguous_params.idist,
|
|
+ contiguous_params.odist,
|
|
+ contiguous_params.transform_type,
|
|
+ cpu_input,
|
|
+ cpu_output);
|
|
+
|
|
+ needed_ram += needed_ram_fftw<Tfloat>(contiguous_params, cpu_plan, verbose);
|
|
+
|
|
+ if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
|
|
+ {
|
|
+ if(verbose)
|
|
+ {
|
|
+ std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]."
|
|
+ << std::endl;
|
|
+ }
|
|
+ GTEST_SKIP();
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ std::vector<hostbuf> gpu_input_data;
|
|
+
|
|
+ // allocate and populate the input buffer (cpu/gpu)
|
|
+ if(run_fftw)
|
|
+ {
|
|
+ gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems);
|
|
+
|
|
+ //generate the input directly on the gpu
|
|
+ params.compute_input(ibuffer);
|
|
+
|
|
+ // Copy the input to CPU
|
|
+ if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
|
|
+ || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
|
|
+ {
|
|
+ // Copy input to CPU
|
|
+ for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
|
|
+ {
|
|
+ hip_status = hipMemcpy(gpu_input_data.at(idx).data(),
|
|
+ ibuffer[idx].data(),
|
|
+ ibuffer_sizes[idx],
|
|
+ hipMemcpyDeviceToHost);
|
|
+ if(hip_status != hipSuccess)
|
|
+ {
|
|
+ ++n_hip_failures;
|
|
+ if(skip_runtime_fails)
|
|
+ {
|
|
+ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ copy_buffers(gpu_input_data,
|
|
+ cpu_input,
|
|
+ params.ilength(),
|
|
+ params.nbatch,
|
|
+ params.precision,
|
|
+ params.itype,
|
|
+ params.istride,
|
|
+ params.idist,
|
|
+ contiguous_params.itype,
|
|
+ contiguous_params.istride,
|
|
+ contiguous_params.idist,
|
|
+ params.ioffset,
|
|
+ contiguous_params.ioffset);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ // Copy input to CPU
|
|
+ for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
|
|
+ {
|
|
+ hip_status = hipMemcpy(cpu_input.at(idx).data(),
|
|
+ ibuffer[idx].data(),
|
|
+ ibuffer_sizes[idx],
|
|
+ hipMemcpyDeviceToHost);
|
|
+ if(hip_status != hipSuccess)
|
|
+ {
|
|
+ ++n_hip_failures;
|
|
+ if(skip_runtime_fails)
|
|
+ {
|
|
+ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ else if(fftw_compare)
|
|
+ {
|
|
+ gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems);
|
|
+
|
|
+ // In case the cached cpu input needed conversion, wait for it
|
|
+ if(convert_cpu_input_precision.valid())
|
|
+ convert_cpu_input_precision.get();
|
|
+
|
|
+ // gets a pre-computed gpu input buffer from the cpu cache
|
|
+ std::vector<hostbuf>* gpu_input = &cpu_input;
|
|
+
|
|
+ if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
|
|
+ || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
|
|
+ {
|
|
+ copy_buffers(cpu_input,
|
|
+ gpu_input_data,
|
|
+ params.ilength(),
|
|
+ params.nbatch,
|
|
+ params.precision,
|
|
+ contiguous_params.itype,
|
|
+ contiguous_params.istride,
|
|
+ contiguous_params.idist,
|
|
+ params.itype,
|
|
+ params.istride,
|
|
+ params.idist,
|
|
+ {0},
|
|
+ params.ioffset);
|
|
+ gpu_input = &gpu_input_data;
|
|
+ }
|
|
+
|
|
+ // Copy input to GPU
|
|
+ for(unsigned int idx = 0; idx < gpu_input->size(); ++idx)
|
|
+ {
|
|
+ hip_status = hipMemcpy(ibuffer[idx].data(),
|
|
+ gpu_input->at(idx).data(),
|
|
+ ibuffer_sizes[idx],
|
|
+ hipMemcpyHostToDevice);
|
|
+
|
|
+ if(hip_status != hipSuccess)
|
|
+ {
|
|
+ ++n_hip_failures;
|
|
+ if(skip_runtime_fails)
|
|
+ {
|
|
+ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if(verbose > 3)
|
|
+ {
|
|
+ std::cout << "CPU input:\n";
|
|
+ contiguous_params.print_ibuffer(cpu_input);
|
|
+ }
|
|
+
|
|
+ // compute input norm
|
|
+ std::shared_future<VectorNorms> cpu_input_norm;
|
|
+ if(fftw_compare)
|
|
+ cpu_input_norm = std::async(std::launch::async, [&]() {
|
|
+ // in case the cached cpu input needed conversion, wait for it
|
|
+ if(convert_cpu_input_precision.valid())
|
|
+ convert_cpu_input_precision.get();
|
|
+
|
|
+ auto input_norm = norm(cpu_input,
|
|
+ contiguous_params.ilength(),
|
|
+ contiguous_params.nbatch,
|
|
+ contiguous_params.precision,
|
|
+ contiguous_params.itype,
|
|
+ contiguous_params.istride,
|
|
+ contiguous_params.idist,
|
|
+ contiguous_params.ioffset);
|
|
+ if(verbose > 2)
|
|
+ {
|
|
+ std::cout << "CPU Input Linf norm: " << input_norm.l_inf << "\n";
|
|
+ std::cout << "CPU Input L2 norm: " << input_norm.l_2 << "\n";
|
|
+ }
|
|
+ return input_norm;
|
|
+ });
|
|
+
|
|
+ std::vector<gpubuf> obuffer_data;
|
|
+ std::vector<gpubuf>* obuffer = &obuffer_data;
|
|
+ std::vector<void*> pobuffer;
|
|
+
|
|
+ // allocate the output buffer
|
|
+
|
|
+ if(params.placement == fft_placement_inplace)
|
|
+ {
|
|
+ obuffer = &ibuffer;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ auto obuffer_sizes = params.obuffer_sizes();
|
|
+ obuffer_data.resize(obuffer_sizes.size());
|
|
+ for(unsigned int i = 0; i < obuffer_data.size(); ++i)
|
|
+ {
|
|
+ hip_status = obuffer_data[i].alloc(obuffer_sizes[i]);
|
|
+ if(hip_status != hipSuccess)
|
|
+ {
|
|
+ ++n_hip_failures;
|
|
+ std::stringstream ss;
|
|
+ ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i]
|
|
+ << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)"
|
|
+ << " with code " << hipError_to_string(hip_status);
|
|
+ if(skip_runtime_fails)
|
|
+ {
|
|
+ GTEST_SKIP() << ss.str();
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ GTEST_FAIL() << ss.str();
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // If we're validating output strides, init the
|
|
+ // output buffer to a known pattern and we can check
|
|
+ // that the pattern is untouched in places that
|
|
+ // shouldn't have been touched.
|
|
+ if(params.check_output_strides)
|
|
+ {
|
|
+ hip_status
|
|
+ = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
|
|
+ if(hip_status != hipSuccess)
|
|
+ {
|
|
+ ++n_hip_failures;
|
|
+ if(skip_runtime_fails)
|
|
+ {
|
|
+ GTEST_SKIP() << "hipMemset failure with error " << hip_status;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ GTEST_FAIL() << "hipMemset failure with error " << hip_status;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ pobuffer.resize(obuffer->size());
|
|
+ for(unsigned int i = 0; i < obuffer->size(); ++i)
|
|
+ {
|
|
+ pobuffer[i] = obuffer->at(i).data();
|
|
+ }
|
|
+
|
|
+ // Run CPU transform
|
|
+ //
|
|
+ // NOTE: This must happen after input is copied to GPU and input
|
|
+ // norm is computed, since the CPU FFT may overwrite the input.
|
|
+ VectorNorms cpu_output_norm;
|
|
+ std::shared_future<void> cpu_fft;
|
|
+ if(fftw_compare)
|
|
+ cpu_fft = std::async(std::launch::async, [&]() {
|
|
+ // wait for input norm to finish, since we might overwrite input
|
|
+ cpu_input_norm.get();
|
|
+
|
|
+ if(run_fftw)
|
|
+ execute_cpu_fft<Tfloat>(params, contiguous_params, cpu_plan, cpu_input, cpu_output);
|
|
+ // in case the cached cpu output needed conversion, wait for it
|
|
+ else if(convert_cpu_output_precision.valid())
|
|
+ convert_cpu_output_precision.get();
|
|
+
|
|
+ if(verbose > 3)
|
|
+ {
|
|
+ std::cout << "CPU output:\n";
|
|
+ contiguous_params.print_obuffer(cpu_output);
|
|
+ }
|
|
+
|
|
+ cpu_output_norm = norm(cpu_output,
|
|
+ params.olength(),
|
|
+ params.nbatch,
|
|
+ params.precision,
|
|
+ contiguous_params.otype,
|
|
+ contiguous_params.ostride,
|
|
+ contiguous_params.odist,
|
|
+ contiguous_params.ooffset);
|
|
+ if(verbose > 2)
|
|
+ {
|
|
+ std::cout << "CPU Output Linf norm: " << cpu_output_norm.l_inf << "\n";
|
|
+ std::cout << "CPU Output L2 norm: " << cpu_output_norm.l_2 << "\n";
|
|
+ }
|
|
+ });
|
|
+
|
|
+ // scatter data out to multi-GPUs if this is a multi-GPU test
|
|
+ params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer);
|
|
+
|
|
+ // execute GPU transform
|
|
+ std::vector<hostbuf> gpu_output
|
|
+ = allocate_host_buffer(params.precision, params.otype, params.osize);
|
|
+
|
|
+ execute_gpu_fft(params, pibuffer, pobuffer, *obuffer, gpu_output);
|
|
+
|
|
+ params.free();
|
|
+
|
|
+ if(params.check_output_strides)
|
|
+ {
|
|
+ check_output_strides<Tparams>(gpu_output, params);
|
|
+ }
|
|
+
|
|
+ // compute GPU output norm
|
|
+ std::shared_future<VectorNorms> gpu_norm;
|
|
+ if(fftw_compare)
|
|
+ gpu_norm = std::async(std::launch::async, [&]() {
|
|
+ return norm(gpu_output,
|
|
+ params.olength(),
|
|
+ params.nbatch,
|
|
+ params.precision,
|
|
+ params.otype,
|
|
+ params.ostride,
|
|
+ params.odist,
|
|
+ params.ooffset);
|
|
+ });
|
|
+
|
|
+ // compare output
|
|
+ //
|
|
+ // Compute the l-infinity and l-2 distance between the CPU and GPU output:
|
|
+ // wait for cpu FFT so we can compute cutoff
|
|
+
|
|
+ const auto total_length = std::accumulate(params.length.begin(),
|
|
+ params.length.end(),
|
|
+ static_cast<size_t>(1),
|
|
+ std::multiplies<size_t>());
|
|
+
|
|
+ std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
|
|
+ if(verbose > 1)
|
|
+ linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
|
|
+ double linf_cutoff;
|
|
+ VectorNorms diff;
|
|
+
|
|
+ std::shared_future<void> compare_output;
|
|
+ if(fftw_compare)
|
|
+ compare_output = std::async(std::launch::async, [&]() {
|
|
+ cpu_fft.get();
|
|
+ linf_cutoff
|
|
+ = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length);
|
|
+
|
|
+ diff = distance(cpu_output,
|
|
+ gpu_output,
|
|
+ params.olength(),
|
|
+ params.nbatch,
|
|
+ params.precision,
|
|
+ contiguous_params.otype,
|
|
+ contiguous_params.ostride,
|
|
+ contiguous_params.odist,
|
|
+ params.otype,
|
|
+ params.ostride,
|
|
+ params.odist,
|
|
+ linf_failures.get(),
|
|
+ linf_cutoff,
|
|
+ {0},
|
|
+ params.ooffset);
|
|
+ });
|
|
+
|
|
+ // Update the cache if this current transform is different from
|
|
+ // what's stored. But if this transform only has a smaller batch
|
|
+ // than what's cached, we can still keep the cache around since
|
|
+ // the input/output we already have is still valid.
|
|
+ const bool update_last_cpu_fft_data
|
|
+ = last_cpu_fft_data.length != params.length
|
|
+ || last_cpu_fft_data.transform_type != params.transform_type
|
|
+ || last_cpu_fft_data.run_callbacks != params.run_callbacks
|
|
+ || last_cpu_fft_data.precision != params.precision
|
|
+ || params.nbatch > last_cpu_fft_data.nbatch;
|
|
+
|
|
+ // store cpu output in cache
|
|
+ if(update_last_cpu_fft_data)
|
|
+ {
|
|
+ last_cpu_fft_data.length = params.length;
|
|
+ last_cpu_fft_data.nbatch = params.nbatch;
|
|
+ last_cpu_fft_data.transform_type = params.transform_type;
|
|
+ last_cpu_fft_data.run_callbacks = params.run_callbacks;
|
|
+ last_cpu_fft_data.precision = params.precision;
|
|
+ }
|
|
+
|
|
+ if(compare_output.valid())
|
|
+ compare_output.get();
|
|
+
|
|
+ if(!store_to_cache)
|
|
+ store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
|
|
+
|
|
+ Tparams params_inverse;
|
|
+
|
|
+ if(round_trip)
|
|
+ {
|
|
+ params_inverse.inverse_from_forward(params);
|
|
+
|
|
+ run_round_trip_inverse<Tparams>(
|
|
+ params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data);
|
|
+ }
|
|
+
|
|
+ if(fftw_compare)
|
|
+ {
|
|
+ ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2));
|
|
+ ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf));
|
|
+
|
|
+ ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2));
|
|
+ ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf));
|
|
+
|
|
+ if(verbose > 1)
|
|
+ {
|
|
+ std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n";
|
|
+ std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n";
|
|
+ std::cout << "GPU linf norm failures:";
|
|
+ std::sort(linf_failures->begin(), linf_failures->end());
|
|
+ for(const auto& i : *linf_failures)
|
|
+ {
|
|
+ std::cout << " (" << i.first << "," << i.second << ")";
|
|
+ }
|
|
+ std::cout << std::endl;
|
|
+ }
|
|
+
|
|
+ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str();
|
|
+ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str();
|
|
+ }
|
|
+
|
|
+ switch(params.precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ max_linf_eps_half
|
|
+ = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
|
|
+ max_l2_eps_half
|
|
+ = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ max_linf_eps_single
|
|
+ = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
|
|
+ max_l2_eps_single = std::max(max_l2_eps_single,
|
|
+ diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ max_linf_eps_double
|
|
+ = std::max(max_linf_eps_double, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
|
|
+ max_l2_eps_double = std::max(max_l2_eps_double,
|
|
+ diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if(verbose > 1)
|
|
+ {
|
|
+ std::cout << "L2 diff: " << diff.l_2 << "\n";
|
|
+ std::cout << "Linf diff: " << diff.l_inf << "\n";
|
|
+ }
|
|
+
|
|
+ if(fftw_compare)
|
|
+ {
|
|
+ EXPECT_TRUE(diff.l_inf <= linf_cutoff)
|
|
+ << "Linf test failed. Linf:" << diff.l_inf
|
|
+ << "\tnormalized Linf: " << diff.l_inf / cpu_output_norm.l_inf
|
|
+ << "\tcutoff: " << linf_cutoff << params.str();
|
|
+
|
|
+ EXPECT_TRUE(diff.l_2 / cpu_output_norm.l_2
|
|
+ < sqrt(log2(total_length)) * type_epsilon(params.precision))
|
|
+ << "L2 test failed. L2: " << diff.l_2
|
|
+ << "\tnormalized L2: " << diff.l_2 / cpu_output_norm.l_2
|
|
+ << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
|
|
+ << params.str();
|
|
+ }
|
|
+
|
|
+ if(round_trip && fftw_compare)
|
|
+ {
|
|
+ compare_round_trip_inverse<Tparams>(params_inverse,
|
|
+ contiguous_params,
|
|
+ gpu_input_data,
|
|
+ cpu_input,
|
|
+ cpu_input_norm.get(),
|
|
+ total_length);
|
|
+ }
|
|
+}
|
|
+
|
|
+#endif
|
|
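The Linf and L2 checks above scale machine epsilon with the transform size. A minimal standalone sketch of that tolerance rule, assuming total_length is the product of the FFT lengths and eps is the epsilon for the chosen precision (within_fft_tolerance is a hypothetical helper, not part of the patch):

// Sketch of the acceptance test used by fft_vs_reference_impl above.
#include <cmath>
#include <cstddef>

inline bool within_fft_tolerance(double linf_diff, double l2_diff,
                                 double cpu_linf_norm, double cpu_l2_norm,
                                 size_t total_length, double eps)
{
    // Linf errors may grow with log(N), scaled by the reference output norm.
    const double linf_cutoff = eps * cpu_linf_norm * std::log(double(total_length));
    // Relative L2 errors may grow with sqrt(log2(N)).
    const double l2_cutoff = std::sqrt(std::log2(double(total_length))) * eps;
    return linf_diff <= linf_cutoff && (l2_diff / cpu_l2_norm) < l2_cutoff;
}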
diff --git a/shared/arithmetic.h b/shared/arithmetic.h
new file mode 100644
index 0000000..774d342
--- /dev/null
+++ b/shared/arithmetic.h
@@ -0,0 +1,61 @@
+/******************************************************************************
+* Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#pragma once
+
+#include <numeric>
+#include <stddef.h>
+
+// arithmetic helper functions
+
+static inline bool IsPo2(size_t u)
+{
+    return (u != 0) && (0 == (u & (u - 1)));
+}
+
+// helper function: find the smallest power of 2 that is >= n; return its
+// power of 2 factor
+// e.g., CeilPo2(7) returns 3 : (2^3 >= 7)
+static inline size_t CeilPo2(size_t n)
+{
+    size_t v = 1, t = 0;
+    while(v < n)
+    {
+        v <<= 1;
+        t++;
+    }
+
+    return t;
+}
+
+template <typename T>
+static inline T DivRoundingUp(T a, T b)
+{
+    return (a + (b - 1)) / b;
+}
+
+template <typename Titer>
+typename Titer::value_type product(Titer begin, Titer end)
+{
+    return std::accumulate(
+        begin, end, typename Titer::value_type(1), std::multiplies<typename Titer::value_type>());
+}
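A quick usage sketch for these helpers (the include path is an assumption; these are the functions defined just above):

#include <cassert>
#include <vector>
#include "../shared/arithmetic.h"

int main()
{
    assert(IsPo2(8) && !IsPo2(12));      // power-of-2 test
    assert(CeilPo2(7) == 3);             // smallest t with 2^t >= 7
    assert(DivRoundingUp(10, 4) == 3);   // ceiling division
    std::vector<size_t> len{2, 3, 4};
    assert(product(len.begin(), len.end()) == 24); // product of all lengths
    return 0;
}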
diff --git a/shared/array_predicate.h b/shared/array_predicate.h
new file mode 100644
index 0000000..92e45b4
--- /dev/null
+++ b/shared/array_predicate.h
@@ -0,0 +1,47 @@
+// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_ARRAY_PREDICATE_H
+#define ROCFFT_ARRAY_PREDICATE_H
+
+#include "rocfft/rocfft.h"
+
+namespace
+{
+    bool array_type_is_complex(rocfft_array_type type)
+    {
+        return type == rocfft_array_type_complex_interleaved
+               || type == rocfft_array_type_complex_planar
+               || type == rocfft_array_type_hermitian_interleaved
+               || type == rocfft_array_type_hermitian_planar;
+    }
+    bool array_type_is_interleaved(rocfft_array_type type)
+    {
+        return type == rocfft_array_type_complex_interleaved
+               || type == rocfft_array_type_hermitian_interleaved;
+    }
+    bool array_type_is_planar(rocfft_array_type type)
+    {
+        return type == rocfft_array_type_complex_planar
+               || type == rocfft_array_type_hermitian_planar;
+    }
+}
+
+#endif
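These predicates classify rocFFT array formats: interleaved types keep (real, imag) pairs in one buffer, while planar types split real and imaginary parts into two buffers. A small usage sketch, assuming a rocFFT install provides rocfft/rocfft.h:

#include <cassert>
#include "../shared/array_predicate.h"

int main()
{
    assert(array_type_is_complex(rocfft_array_type_complex_planar));
    assert(array_type_is_planar(rocfft_array_type_hermitian_planar));
    assert(!array_type_is_interleaved(rocfft_array_type_complex_planar));
    return 0;
}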
diff --git a/shared/array_validator.cpp b/shared/array_validator.cpp
new file mode 100644
index 0000000..70abb08
--- /dev/null
+++ b/shared/array_validator.cpp
@@ -0,0 +1,549 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <stdexcept>
+#include <unordered_set>
+
+#include "array_validator.h"
+#include "increment.h"
+
+// Check a 2D array for collisions.
+// The 2D case can be determined via a number-theoretic argument.
+bool valid_length_stride_2d(const size_t l0, const size_t l1, const size_t s0, const size_t s1)
+{
+    if(s0 == s1)
+        return false;
+    const auto c = std::lcm(s0, s1);
+    return !((s0 * (l0 - 1) >= c) && (s1 * (l1 - 1) >= c));
+}
+
+// Compare a 1D direction with a multi-index hyperface for collisions.
+bool valid_length_stride_1d_multi(const unsigned int idx,
+                                  const std::vector<size_t> l,
+                                  const std::vector<size_t> s,
+                                  const int verbose)
+{
+    size_t l0{0}, s0{0};
+    std::vector<size_t> l1{}, s1{};
+    for(unsigned int i = 0; i < l.size(); ++i)
+    {
+        if(i == idx)
+        {
+            l0 = l[i];
+            s0 = s[i];
+        }
+        else
+        {
+            l1.push_back(l[i]);
+            s1.push_back(s[i]);
+        }
+    }
+
+    if(verbose > 4)
+    {
+        std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl;
+    }
+
+    // We only need to go to the maximum pointer offset for (l1,s1).
+    const auto max_offset
+        = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies<size_t>())
+          - std::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0);
+    std::unordered_set<size_t> a0{};
+    for(size_t i = 1; i < l0; ++i)
+    {
+        const auto val = i * s0;
+        if(val <= max_offset)
+            a0.insert(val);
+        else
+            break;
+    }
+
+    if(verbose > 5)
+    {
+        std::cout << "a0:";
+        for(auto i : a0)
+            std::cout << " " << i;
+        std::cout << std::endl;
+
+        std::cout << "l1:";
+        for(auto i : l1)
+            std::cout << " " << i;
+        std::cout << std::endl;
+
+        std::cout << "s1:";
+        for(auto i : s1)
+            std::cout << " " << i;
+        std::cout << std::endl;
+    }
+
+    // TODO: this can be multi-threaded, since find(...) is thread-safe.
+    std::vector<size_t> index(l1.size());
+    std::fill(index.begin(), index.end(), 0);
+    do
+    {
+        const int i = std::inner_product(index.begin(), index.end(), s1.begin(), (size_t)0);
+        if(i > 0 && (i % s0 == 0))
+        {
+            // TODO: use an ordered set and binary search
+            if(verbose > 6)
+                std::cout << i << std::endl;
+            if(a0.find(i) != a0.end())
+            {
+                if(verbose > 4)
+                {
+                    std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl;
+                    std::cout << "l1:";
+                    for(const auto li : l1)
+                        std::cout << " " << li;
+                    std::cout << " s1:";
+                    for(const auto si : s1)
+                        std::cout << " " << si;
+                    std::cout << std::endl;
+                    std::cout << "Found duplicate: " << i << std::endl;
+                }
+                return false;
+            }
+        }
+    } while(increment_rowmajor(index, l1));
+
+    return true;
+}
+
+// Compare a hyperface with another hyperface for collisions.
+bool valid_length_stride_multi_multi(const std::vector<size_t> l0,
+                                     const std::vector<size_t> s0,
+                                     const std::vector<size_t> l1,
+                                     const std::vector<size_t> s1)
+{
+    std::unordered_set<size_t> a0{};
+
+    const auto max_offset
+        = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies<size_t>())
+          - std::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0);
+    std::vector<size_t> index0(l0.size()); // TODO: check this
+    std::fill(index0.begin(), index0.end(), 0);
+    do
+    {
+        const auto i = std::inner_product(index0.begin(), index0.end(), s0.begin(), (size_t)0);
+        if(i > max_offset)
+            a0.insert(i);
+    } while(increment_rowmajor(index0, l0));
+
+    std::vector<size_t> index1(l1.size());
+    std::fill(index1.begin(), index1.end(), 0);
+    do
+    {
+        const auto i = std::inner_product(index1.begin(), index1.end(), s1.begin(), (size_t)0);
+        if(i > 0)
+        {
+            // TODO: use an ordered set and binary search
+            if(a0.find(i) != a0.end())
+            {
+                return false;
+            }
+        }
+    } while(increment_rowmajor(index1, l1));
+
+    return true;
+}
+
+bool valid_length_stride_3d(const std::vector<size_t>& l,
+                            const std::vector<size_t>& s,
+                            const int verbose)
+{
+    // Check that 2D faces are valid:
+    if(!valid_length_stride_2d(l[0], l[1], s[0], s[1]))
+        return false;
+    if(!valid_length_stride_2d(l[0], l[2], s[0], s[2]))
+        return false;
+    if(!valid_length_stride_2d(l[1], l[2], s[1], s[2]))
+        return false;
+
+    // If the 2D faces are valid, check an axis vs a face for collisions:
+    bool invalid = false;
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for(int idx = 0; idx < 3; ++idx)
+    {
+        if(!valid_length_stride_1d_multi(idx, l, s, verbose))
+        {
+#ifdef _OPENMP
+#pragma omp cancel for
+#endif
+            invalid = true;
+        }
+    }
+    if(invalid)
+        return false;
+    return true;
+}
+
+bool valid_length_stride_4d(const std::vector<size_t>& l,
+                            const std::vector<size_t>& s,
+                            const int verbose)
+{
+    if(l.size() != 4)
+    {
+        throw std::runtime_error("Incorrect dimensions for valid_length_stride_4d");
+    }
+
+    // Check that 2D faces are valid:
+    for(int idx0 = 0; idx0 < 3; ++idx0)
+    {
+        for(int idx1 = idx0 + 1; idx1 < 4; ++idx1)
+        {
+            if(!valid_length_stride_2d(l[idx0], l[idx1], s[idx0], s[idx1]))
+                return false;
+        }
+    }
+
+    bool invalid = false;
+    // Check that 1D vs 3D faces are valid:
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for(int idx0 = 0; idx0 < 4; ++idx0)
+    {
+        if(!valid_length_stride_1d_multi(idx0, l, s, verbose))
+        {
+#ifdef _OPENMP
+#pragma omp cancel for
+#endif
+            invalid = true;
+        }
+    }
+    if(invalid)
+        return false;
+
+    // Check that 2D vs 2D faces are valid:
+
+    // First, get all the permutations
+    std::vector<std::vector<size_t>> perms;
+    std::vector<size_t> v(l.size());
+    std::fill(v.begin(), v.begin() + 2, 0);
+    std::fill(v.begin() + 2, v.end(), 1);
+    do
+    {
+        perms.push_back(v);
+        if(verbose > 3)
+        {
+            std::cout << "v:";
+            for(const auto i : v)
+            {
+                std::cout << " " << i;
+            }
+            std::cout << "\n";
+        }
+    } while(std::next_permutation(v.begin(), v.end()));
+
+    // Then loop over all of the permutations.
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for(size_t iperm = 0; iperm < perms.size(); ++iperm)
+    {
+        // Split the axes into the two 2D faces selected by this permutation.
+        std::vector<size_t> l0, s0, l1, s1;
+        l0.reserve(2);
+        s0.reserve(2);
+        l1.reserve(2);
+        s1.reserve(2);
+        for(size_t i = 0; i < l.size(); ++i)
+        {
+            if(perms[iperm][i] == 0)
+            {
+                l0.push_back(l[i]);
+                s0.push_back(s[i]);
+            }
+            else
+            {
+                l1.push_back(l[i]);
+                s1.push_back(s[i]);
+            }
+        }
+
+        if(verbose > 3)
+        {
+            std::cout << "\tl0:";
+            for(const auto i : l0)
+            {
+                std::cout << " " << i;
+            }
+            std::cout << "\n";
+            std::cout << "\ts0:";
+            for(const auto i : s0)
+            {
+                std::cout << " " << i;
+            }
+            std::cout << "\n";
+            std::cout << "\tl1:";
+            for(const auto i : l1)
+            {
+                std::cout << " " << i;
+            }
+            std::cout << "\n";
+            std::cout << "\ts1:";
+            for(const auto i : s1)
+            {
+                std::cout << " " << i;
+            }
+            std::cout << "\n";
+        }
+
+        if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
+        {
+#ifdef _OPENMP
+#pragma omp cancel for
+#endif
+            invalid = true;
+        }
+    }
+    if(invalid)
+        return false;
+
+    return true;
+}
+
+bool valid_length_stride_generald(const std::vector<size_t> l,
+                                  const std::vector<size_t> s,
+                                  const int verbose)
+{
+    if(verbose > 2)
+    {
+        std::cout << "checking dimension " << l.size() << std::endl;
+    }
+
+    // Recurse on d-1 hyper-faces:
+    for(unsigned int idx = 0; idx < l.size(); ++idx)
+    {
+        std::vector<size_t> l0{};
+        std::vector<size_t> s0{};
+        for(size_t i = 0; i < l.size(); ++i)
+        {
+            if(i != idx)
+            {
+                l0.push_back(l[i]);
+                s0.push_back(s[i]);
+            }
+        }
+        if(!array_valid(l0, s0, verbose))
+            return false;
+    }
+
+    // Handle the 1D vs (N-1) case:
+    for(unsigned int idx = 0; idx < l.size(); ++idx)
+    {
+        if(!valid_length_stride_1d_multi(idx, l, s, verbose))
+            return false;
+    }
+
+    for(size_t dim0 = 2; dim0 <= l.size() / 2; ++dim0)
+    {
+        const size_t dim1 = l.size() - dim0;
+        if(verbose > 2)
+            std::cout << "dims: " << dim0 << " " << dim1 << std::endl;
+
+        // We iterate over all permutations of an array of length l.size() which contains dim0 zeros
+        // and dim1 ones. We start with {0, ..., 0, 1, ..., 1} to guarantee that we hit all the
+        // possibilities.
+
+        // First, get all the permutations
+        std::vector<std::vector<size_t>> perms;
+        std::vector<size_t> v(l.size());
+        std::fill(v.begin(), v.begin() + dim1, 0);
+        std::fill(v.begin() + dim1, v.end(), 1);
+        do
+        {
+            perms.push_back(v);
+            if(verbose > 3)
+            {
+                std::cout << "v:";
+                for(const auto i : v)
+                {
+                    std::cout << " " << i;
+                }
+                std::cout << "\n";
+            }
+
+        } while(std::next_permutation(v.begin(), v.end()));
+
+        bool invalid = false;
+        // Then loop over all of the permutations.
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for(size_t iperm = 0; iperm < perms.size(); ++iperm)
+        {
+            // Split the axes into the two hyperfaces selected by this permutation.
+            std::vector<size_t> l0, s0, l1, s1;
+            l0.reserve(dim0);
+            s0.reserve(dim0);
+            l1.reserve(dim1);
+            s1.reserve(dim1);
+
+            for(size_t i = 0; i < l.size(); ++i)
+            {
+                if(perms[iperm][i] == 0)
+                {
+                    l0.push_back(l[i]);
+                    s0.push_back(s[i]);
+                }
+                else
+                {
+                    l1.push_back(l[i]);
+                    s1.push_back(s[i]);
+                }
+            }
+
+            if(verbose > 3)
+            {
+                std::cout << "\tl0:";
+                for(const auto i : l0)
+                {
+                    std::cout << " " << i;
+                }
+                std::cout << "\n";
+                std::cout << "\ts0:";
+                for(const auto i : s0)
+                {
+                    std::cout << " " << i;
+                }
+                std::cout << "\n";
+                std::cout << "\tl1:";
+                for(const auto i : l1)
+                {
+                    std::cout << " " << i;
+                }
+                std::cout << "\n";
+                std::cout << "\ts1:";
+                for(const auto i : s1)
+                {
+                    std::cout << " " << i;
+                }
+                std::cout << "\n";
+            }
+
+            if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
+            {
+#ifdef _OPENMP
+#pragma omp cancel for
+#endif
+                invalid = true;
+            }
+        }
+        if(invalid)
+            return false;
+    }
+
+    return true;
+}
+
+bool sort_by_stride(const std::pair<size_t, size_t>& ls0, const std::pair<size_t, size_t>& ls1)
+{
+    return ls0.second < ls1.second;
+}
+
+bool array_valid(const std::vector<size_t>& length,
+                 const std::vector<size_t>& stride,
+                 const int verbose)
+{
+    if(length.size() != stride.size())
+        return false;
+
+    // If a length is 1, then the stride is irrelevant.
+    // If a length is > 1, then the corresponding stride must be nonzero.
+    std::vector<size_t> l{}, s{};
+    for(unsigned int i = 0; i < length.size(); ++i)
+    {
+        if(length[i] > 1)
+        {
+            if(stride[i] == 0)
+                return false;
+            l.push_back(length[i]);
+            s.push_back(stride[i]);
+        }
+    }
+
+    if(length.size() > 1)
+    {
+        // Check happy path.
+        bool happy_path = true;
+        std::vector<std::pair<size_t, size_t>> ls;
+        for(size_t idx = 0; idx < length.size(); ++idx)
+        {
+            ls.push_back(std::pair(length[idx], stride[idx]));
+        }
+        std::sort(ls.begin(), ls.end(), sort_by_stride);
+
+        if(verbose > 2)
+        {
+            for(size_t idx = 0; idx < ls.size(); ++idx)
+            {
+                std::cout << ls[idx].first << "\t" << ls[idx].second << "\n";
+            }
+        }
+
+        for(size_t idx = 1; idx < ls.size(); ++idx)
+        {
+            if(ls[idx].second < ls[idx - 1].first * ls[idx - 1].second)
+            {
+                happy_path = false;
+                break;
+            }
+        }
+        if(happy_path)
+        {
+            if(verbose > 2)
+            {
+                std::cout << "happy path\n";
+            }
+            return true;
+        }
+    }
+
+    switch(l.size())
+    {
+    case 0:
+        return true;
+        break;
+    case 1:
+        return s[0] != 0;
+        break;
+    case 2:
+    {
+        return valid_length_stride_2d(l[0], l[1], s[0], s[1]);
+        break;
+    }
+    case 3:
+    {
+        return valid_length_stride_3d(l, s, verbose);
+        break;
+    }
+    case 4:
+    {
+        return valid_length_stride_4d(l, s, verbose);
+        break;
+    }
+    default:
+        return valid_length_stride_generald(l, s, verbose);
+    }
+
+    return true;
+}
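To see the 2D rule in action: for lengths {4, 4} with strides {2, 4}, lcm(2, 4) = 4 is reachable along both axes (2*3 >= 4 and 4*3 >= 4), so indices (2, 0) and (0, 1) collide at offset 4 and the layout is rejected; the contiguous strides {1, 4} pass. A sketch, assuming array_validator.h is on the include path:

#include <cassert>
#include "../shared/array_validator.h"

int main()
{
    // Contiguous 4x4 layout: every (x, y) index maps to a unique offset.
    assert(array_valid({4, 4}, {1, 4}));
    // Overlapping layout: 2*2 == 1*4, so two distinct indices share offset 4.
    assert(!array_valid({4, 4}, {2, 4}));
    return 0;
}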
diff --git a/shared/array_validator.h b/shared/array_validator.h
new file mode 100644
index 0000000..ce85173
--- /dev/null
+++ b/shared/array_validator.h
@@ -0,0 +1,31 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ARRAY_VALIDATOR_H
+#define ARRAY_VALIDATOR_H
+
+#include <vector>
+
+// Checks whether the array with given length and stride has multi-index collisions.
+bool array_valid(const std::vector<size_t>& length,
+                 const std::vector<size_t>& stride,
+                 const int verbose = 0);
+
+#endif
diff --git a/shared/concurrency.h b/shared/concurrency.h
new file mode 100644
index 0000000..a36c7c1
--- /dev/null
+++ b/shared/concurrency.h
@@ -0,0 +1,41 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#include <thread>
+
+#ifndef WIN32
+#include <sched.h>
+#endif
+
+// work out how many parallel tasks to run, based on available
+// resources. on Linux, this will look at the cpu affinity mask (if
+// available) which might be restricted in a container. otherwise,
+// return std::thread::hardware_concurrency().
+static unsigned int rocfft_concurrency()
+{
+#ifndef WIN32
+    cpu_set_t cpuset;
+    if(sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0)
+        return CPU_COUNT(&cpuset);
+#endif
+    return std::thread::hardware_concurrency();
+}
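A usage sketch (include path is an assumption): the returned count respects a restricted CPU affinity mask, so it is a safer default than hardware_concurrency() inside containers.

#include <iostream>
#include "../shared/concurrency.h"

int main()
{
    // Size a worker pool by the affinity-aware CPU count.
    const unsigned int nworkers = rocfft_concurrency();
    std::cout << "using " << nworkers << " parallel tasks\n";
    return 0;
}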
diff --git a/shared/data_gen_device.h b/shared/data_gen_device.h
new file mode 100644
index 0000000..77fb012
--- /dev/null
+++ b/shared/data_gen_device.h
@@ -0,0 +1,1303 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef DATA_GEN_DEVICE_H
+#define DATA_GEN_DEVICE_H
+
+// rocRAND can generate warnings if inline asm is not available for
+// some architectures. data generation isn't performance-critical,
+// so just disable inline asm to prevent the warnings.
+#define ROCRAND_DISABLE_INLINE_ASM
+
+#include "../shared/arithmetic.h"
+#include "../shared/device_properties.h"
+#include "../shared/gpubuf.h"
+#include "../shared/increment.h"
+#include "../shared/rocfft_complex.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hiprand/hiprand.h>
+#include <hiprand/hiprand_kernel.h>
+#include <limits>
+#include <vector>
+
+static const unsigned int DATA_GEN_THREADS    = 8;
+static const unsigned int DATA_GEN_GRID_Y_MAX = 64;
+
+template <typename T>
+struct input_val_1D
+{
+    T val1;
+};
+
+template <typename T>
+struct input_val_2D
+{
+    T val1;
+    T val2;
+};
+
+template <typename T>
+struct input_val_3D
+{
+    T val1;
+    T val2;
+    T val3;
+};
+
+template <typename T>
+static input_val_1D<T> get_input_val(const T& val)
+{
+    return input_val_1D<T>{val};
+}
+
+template <typename T>
+static input_val_2D<T> get_input_val(const std::tuple<T, T>& val)
+{
+    return input_val_2D<T>{std::get<0>(val), std::get<1>(val)};
+}
+
+template <typename T>
+static input_val_3D<T> get_input_val(const std::tuple<T, T, T>& val)
+{
+    return input_val_3D<T>{std::get<0>(val), std::get<1>(val), std::get<2>(val)};
+}
+
+template <typename T>
+__device__ static size_t
+    compute_index(const input_val_1D<T>& length, const input_val_1D<T>& stride, size_t base)
+{
+    return (length.val1 * stride.val1) + base;
+}
+
+template <typename T>
+__device__ static size_t
+    compute_index(const input_val_2D<T>& length, const input_val_2D<T>& stride, size_t base)
+{
+    return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base;
+}
+
+template <typename T>
+__device__ static size_t
+    compute_index(const input_val_3D<T>& length, const input_val_3D<T>& stride, size_t base)
+{
+    return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3)
+           + base;
+}
+
+template <typename T>
+static inline input_val_1D<T> make_zero_length(const input_val_1D<T>& whole_length)
+{
+    return input_val_1D<T>{0};
+}
+
+template <typename T>
+static inline input_val_2D<T> make_zero_length(const input_val_2D<T>& whole_length)
+{
+    return input_val_2D<T>{0, 0};
+}
+
+template <typename T>
+static inline input_val_3D<T> make_zero_length(const input_val_3D<T>& whole_length)
+{
+    return input_val_3D<T>{0, 0, 0};
+}
+
+template <typename T>
+static inline input_val_1D<T> make_unit_stride(const input_val_1D<T>& whole_length)
+{
+    return input_val_1D<T>{1};
+}
+
+template <typename T>
+static inline input_val_2D<T> make_unit_stride(const input_val_2D<T>& whole_length)
+{
+    return input_val_2D<T>{1, whole_length.val1};
+}
+
+template <typename T>
+static inline input_val_3D<T> make_unit_stride(const input_val_3D<T>& whole_length)
+{
+    return input_val_3D<T>{1, whole_length.val1, whole_length.val1 * whole_length.val2};
+}
+
+template <typename T>
+__device__ static input_val_1D<T> get_length(const size_t i, const input_val_1D<T>& whole_length)
+{
+    auto xlen = whole_length.val1;
+
+    auto xidx = i % xlen;
+
+    return input_val_1D<T>{xidx};
+}
+
+template <typename T>
+__device__ static input_val_2D<T> get_length(const size_t i, const input_val_2D<T>& whole_length)
+{
+    auto xlen = whole_length.val1;
+    auto ylen = whole_length.val2;
+
+    auto xidx = i % xlen;
+    auto yidx = i / xlen % ylen;
+
+    return input_val_2D<T>{xidx, yidx};
+}
+
+template <typename T>
+__device__ static input_val_3D<T> get_length(const size_t i, const input_val_3D<T>& whole_length)
+{
+    auto xlen = whole_length.val1;
+    auto ylen = whole_length.val2;
+    auto zlen = whole_length.val3;
+
+    auto xidx = i % xlen;
+    auto yidx = i / xlen % ylen;
+    auto zidx = i / xlen / ylen % zlen;
+
+    return input_val_3D<T>{xidx, yidx, zidx};
+}
+
+template <typename T>
+__device__ static size_t get_batch(const size_t i, const input_val_1D<T>& whole_length)
+{
+    auto xlen = whole_length.val1;
+
+    auto yidx = i / xlen;
+
+    return yidx;
+}
+
+template <typename T>
+__device__ static size_t get_batch(const size_t i, const input_val_2D<T>& whole_length)
+{
+    auto xlen = whole_length.val1;
+    auto ylen = whole_length.val2;
+
+    auto zidx = i / xlen / ylen;
+
+    return zidx;
+}
+
+template <typename T>
+__device__ static size_t get_batch(const size_t i, const input_val_3D<T>& length)
+{
+    auto xlen = length.val1;
+    auto ylen = length.val2;
+    auto zlen = length.val3;
+
+    auto widx = i / xlen / ylen / zlen;
+
+    return widx;
+}
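The get_length/get_batch pair decomposes a flat element id row-major style: fastest axis first, with whatever remains becoming the batch index. A host-side sketch of the 2D decomposition (values chosen only for illustration):

#include <cassert>
#include <cstddef>

int main()
{
    const size_t Nx = 4, Ny = 3;
    const size_t i  = 17;           // flat element id, as computed in the kernels below
    assert(i % Nx == 1);            // x index, as in get_length
    assert((i / Nx) % Ny == 1);     // y index, as in get_length
    assert(i / Nx / Ny == 1);       // batch index, as in get_batch
    return 0;
}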
+
+__device__ static double make_random_val(hiprandStatePhilox4_32_10* gen_state, double offset)
+{
+    return hiprand_uniform_double(gen_state) + offset;
+}
+
+__device__ static float make_random_val(hiprandStatePhilox4_32_10* gen_state, float offset)
+{
+    return hiprand_uniform(gen_state) + offset;
+}
+
+__device__ static _Float16 make_random_val(hiprandStatePhilox4_32_10* gen_state, _Float16 offset)
+{
+    return static_cast<_Float16>(hiprand_uniform(gen_state)) + offset;
+}
+
+template <typename Tcomplex>
+__device__ static void set_imag_zero(const size_t pos, Tcomplex* x)
+{
+    x[pos].y = 0.0;
+}
+
+template <typename Tfloat>
+__device__ static void set_imag_zero(const size_t pos, Tfloat* xreal, Tfloat* ximag)
+{
+    ximag[pos] = 0.0;
+}
+
+template <typename Tcomplex>
+__device__ static void conjugate(const size_t pos, const size_t cpos, Tcomplex* x)
+{
+    x[pos].x = x[cpos].x;
+    x[pos].y = -x[cpos].y;
+}
+
+template <typename Tfloat>
+__device__ static void conjugate(const size_t pos, const size_t cpos, Tfloat* xreal, Tfloat* ximag)
+{
+    xreal[pos] = xreal[cpos];
+    ximag[pos] = -ximag[cpos];
+}
+
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+    generate_random_interleaved_data_kernel(const Tint whole_length,
+                                            const Tint zero_length,
+                                            const size_t idist,
+                                            const size_t isize,
+                                            const Tint istride,
+                                            rocfft_complex<Treal>* data)
+{
+    auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
+                   + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
+    static_assert(sizeof(i) >= sizeof(isize));
+    if(i < isize)
+    {
+        auto i_length = get_length(i, whole_length);
+        auto i_batch  = get_batch(i, whole_length);
+        auto i_base   = i_batch * idist;
+
+        auto seed = compute_index(zero_length, istride, i_base);
+        auto idx  = compute_index(i_length, istride, i_base);
+
+        hiprandStatePhilox4_32_10 gen_state;
+        hiprand_init(seed, idx, 0, &gen_state);
+
+        data[idx].x = make_random_val(&gen_state, static_cast<Treal>(-0.5));
+        data[idx].y = make_random_val(&gen_state, static_cast<Treal>(-0.5));
+    }
+}
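The kernel above guards on i < isize, so any grid covering isize elements works. One plausible way to shape such a grid from these constants, folding excess blocks into grid.y (an illustrative assumption, not the patch's own launch code; DivRoundingUp comes from shared/arithmetic.h):

#include <algorithm>
#include <hip/hip_runtime.h>

// Hypothetical launch-geometry helper for the data generation kernels.
static dim3 data_gen_grid(size_t isize)
{
    const size_t total_blocks = DivRoundingUp<size_t>(isize, DATA_GEN_THREADS);
    const size_t grid_y       = std::min<size_t>(DATA_GEN_GRID_Y_MAX, total_blocks);
    const size_t grid_x       = DivRoundingUp(total_blocks, grid_y);
    return dim3(static_cast<unsigned int>(grid_x), static_cast<unsigned int>(grid_y));
}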
+
|
|
+template <typename Tint, typename Treal>
|
|
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
|
|
+ generate_interleaved_data_kernel(const Tint whole_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint istride,
|
|
+ const Tint ustride,
|
|
+ const Treal inv_scale,
|
|
+ rocfft_complex<Treal>* data)
|
|
+{
|
|
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
|
|
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
|
|
+ static_assert(sizeof(i) >= sizeof(isize));
|
|
+ if(i < isize)
|
|
+ {
|
|
+ const auto i_length = get_length(i, whole_length);
|
|
+ const auto i_batch = get_batch(i, whole_length);
|
|
+ const auto i_base = i_batch * idist;
|
|
+
|
|
+ const auto val = static_cast<Treal>(-0.5)
|
|
+ + static_cast<Treal>(
|
|
+ static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
|
|
+ * inv_scale;
|
|
+
|
|
+ const auto idx = compute_index(i_length, istride, i_base);
|
|
+
|
|
+ data[idx].x = val;
|
|
+ data[idx].y = val;
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tint, typename Treal>
|
|
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
|
|
+ generate_random_planar_data_kernel(const Tint whole_length,
|
|
+ const Tint zero_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint istride,
|
|
+ Treal* real_data,
|
|
+ Treal* imag_data)
|
|
+{
|
|
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
|
|
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
|
|
+ static_assert(sizeof(i) >= sizeof(isize));
|
|
+ if(i < isize)
|
|
+ {
|
|
+ auto i_length = get_length(i, whole_length);
|
|
+ auto i_batch = get_batch(i, whole_length);
|
|
+ auto i_base = i_batch * idist;
|
|
+
|
|
+ auto seed = compute_index(zero_length, istride, i_base);
|
|
+ auto idx = compute_index(i_length, istride, i_base);
|
|
+
|
|
+ hiprandStatePhilox4_32_10 gen_state;
|
|
+ hiprand_init(seed, idx, 0, &gen_state);
|
|
+
|
|
+ real_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
|
|
+ imag_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tint, typename Treal>
|
|
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
|
|
+ generate_planar_data_kernel(const Tint whole_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint istride,
|
|
+ const Tint ustride,
|
|
+ const Treal inv_scale,
|
|
+ Treal* real_data,
|
|
+ Treal* imag_data)
|
|
+{
|
|
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
|
|
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
|
|
+ static_assert(sizeof(i) >= sizeof(isize));
|
|
+ if(i < isize)
|
|
+ {
|
|
+ const auto i_length = get_length(i, whole_length);
|
|
+ const auto i_batch = get_batch(i, whole_length);
|
|
+ const auto i_base = i_batch * idist;
|
|
+
|
|
+ const auto val = static_cast<Treal>(-0.5)
|
|
+ + static_cast<Treal>(
|
|
+ static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
|
|
+ * inv_scale;
|
|
+
|
|
+ const auto idx = compute_index(i_length, istride, i_base);
|
|
+
|
|
+ real_data[idx] = val;
|
|
+ imag_data[idx] = val;
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tint, typename Treal>
|
|
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
|
|
+ generate_random_real_data_kernel(const Tint whole_length,
|
|
+ const Tint zero_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint istride,
|
|
+ Treal* data)
|
|
+{
|
|
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
|
|
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
|
|
+ static_assert(sizeof(i) >= sizeof(isize));
|
|
+ if(i < isize)
|
|
+ {
|
|
+ auto i_length = get_length(i, whole_length);
|
|
+ auto i_batch = get_batch(i, whole_length);
|
|
+ auto i_base = i_batch * idist;
|
|
+
|
|
+ auto seed = compute_index(zero_length, istride, i_base);
|
|
+ auto idx = compute_index(i_length, istride, i_base);
|
|
+
|
|
+ hiprandStatePhilox4_32_10 gen_state;
|
|
+ hiprand_init(seed, idx, 0, &gen_state);
|
|
+
|
|
+ data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tint, typename Treal>
|
|
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
|
|
+ generate_real_data_kernel(const Tint whole_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint istride,
|
|
+ const Tint ustride,
|
|
+ const Treal inv_scale,
|
|
+ Treal* data)
|
|
+{
|
|
+ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
|
|
+ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
|
|
+ static_assert(sizeof(i) >= sizeof(isize));
|
|
+ if(i < isize)
|
|
+ {
|
|
+ const auto i_length = get_length(i, whole_length);
|
|
+ const auto i_batch = get_batch(i, whole_length);
|
|
+ const auto i_base = i_batch * idist;
|
|
+
|
|
+ const auto val = static_cast<Treal>(-0.5)
|
|
+ + static_cast<Treal>(
|
|
+ static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
|
|
+ * inv_scale;
|
|
+
|
|
+ const auto idx = compute_index(i_length, istride, i_base);
|
|
+
|
|
+ data[idx] = val;
|
|
+ }
|
|
+}
|
|
+
|
|
+// For complex-to-real transforms, the input data must be Hermitian-symmetric.
+// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
+// space. For multi-dimensional data, this means that we only need to store a bit more
+// than half of the complex values; the rest are redundant. However, there are still
+// some restrictions:
+// * the origin and Nyquist value(s) must be real-valued
+// * some of the remaining values are still redundant, and you might get different results
+//   than you expect if the values don't agree.
+
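To make the rule above concrete, here is a minimal host-side sketch, assuming nothing beyond the standard library (the length-4 array and its values are invented purely for illustration); it checks the stated conditions for an even-length 1D signal:

#include <cassert>
#include <complex>
#include <cstddef>

int main()
{
    // Fourier data of a length-4 real signal: u[N-k] == conj(u[k]),
    // and the origin u[0] and Nyquist u[N/2] entries are purely real.
    const size_t               N    = 4;
    const std::complex<double> u[N] = {{1.0, 0.0}, {2.0, 3.0}, {4.0, 0.0}, {2.0, -3.0}};
    assert(u[0].imag() == 0.0); // origin is real-valued
    assert(u[N / 2].imag() == 0.0); // Nyquist is real-valued for even N
    for(size_t k = 1; k < N; ++k)
        assert(u[N - k] == std::conj(u[k]));
    return 0;
}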
+template <typename Tcomplex>
|
|
+__global__ static void impose_hermitian_symmetry_interleaved_1D_kernel(Tcomplex* x,
|
|
+ const size_t Nx,
|
|
+ const size_t xstride,
|
|
+ const size_t dist,
|
|
+ const size_t batch_total,
|
|
+ const bool Nxeven)
|
|
+{
|
|
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
|
|
+ static_assert(sizeof(id_batch) == sizeof(size_t));
|
|
+
|
|
+ if(id_batch < batch_total)
|
|
+ {
|
|
+ id_batch *= dist;
|
|
+
|
|
+ set_imag_zero(id_batch, x);
|
|
+
|
|
+ if(Nxeven)
|
|
+ set_imag_zero(id_batch + (Nx / 2) * xstride, x);
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat>
|
|
+__global__ static void impose_hermitian_symmetry_planar_1D_kernel(Tfloat* xreal,
|
|
+ Tfloat* ximag,
|
|
+ const size_t Nx,
|
|
+ const size_t xstride,
|
|
+ const size_t dist,
|
|
+ const size_t batch_total,
|
|
+ const bool Nxeven)
|
|
+{
|
|
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
|
|
+ static_assert(sizeof(id_batch) == sizeof(size_t));
|
|
+
|
|
+ if(id_batch < batch_total)
|
|
+ {
|
|
+ id_batch *= dist;
|
|
+
|
|
+ set_imag_zero(id_batch, xreal, ximag);
|
|
+
|
|
+ if(Nxeven)
|
|
+ set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag);
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tcomplex>
|
|
+__global__ static void impose_hermitian_symmetry_interleaved_2D_kernel(Tcomplex* x,
|
|
+ const size_t Nx,
|
|
+ const size_t Ny,
|
|
+ const size_t xstride,
|
|
+ const size_t ystride,
|
|
+ const size_t dist,
|
|
+ const size_t batch_total,
|
|
+ const size_t x_total,
|
|
+ const bool Nxeven,
|
|
+ const bool Nyeven)
|
|
+{
|
|
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
|
|
+ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
|
|
+ static_assert(sizeof(id_batch) == sizeof(size_t));
|
|
+ static_assert(sizeof(id_x) == sizeof(size_t));
|
|
+
|
|
+ if(id_batch < batch_total)
|
|
+ {
|
|
+ id_batch *= dist;
|
|
+
|
|
+ if(id_x == 0)
|
|
+ set_imag_zero(id_batch, x);
|
|
+
|
|
+ if(id_x == 0 && Nxeven)
|
|
+ set_imag_zero(id_batch + (Nx / 2) * xstride, x);
|
|
+
|
|
+ if(id_x == 0 && Nyeven)
|
|
+ set_imag_zero(id_batch + ystride * (Ny / 2), x);
|
|
+
|
|
+ if(id_x == 0 && Nxeven && Nyeven)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x);
|
|
+
|
|
+ if(id_x < x_total)
|
|
+ {
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x);
|
|
+
|
|
+ if(Nyeven)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
|
|
+ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
|
|
+ x);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat>
|
|
+__global__ static void impose_hermitian_symmetry_planar_2D_kernel(Tfloat* xreal,
|
|
+ Tfloat* ximag,
|
|
+ const size_t Nx,
|
|
+ const size_t Ny,
|
|
+ const size_t xstride,
|
|
+ const size_t ystride,
|
|
+ const size_t dist,
|
|
+ const size_t batch_total,
|
|
+ const size_t x_total,
|
|
+ const bool Nxeven,
|
|
+ const bool Nyeven)
|
|
+{
|
|
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
|
|
+ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
|
|
+ static_assert(sizeof(id_batch) == sizeof(size_t));
|
|
+ static_assert(sizeof(id_x) == sizeof(size_t));
|
|
+
|
|
+ if(id_batch < batch_total)
|
|
+ {
|
|
+ id_batch *= dist;
|
|
+
|
|
+ if(id_x == 0)
|
|
+ set_imag_zero(id_batch, xreal, ximag);
|
|
+
|
|
+ if(id_x == 0 && Nxeven)
|
|
+ set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag);
|
|
+
|
|
+ if(id_x == 0 && Nyeven)
|
|
+ set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag);
|
|
+
|
|
+ if(id_x == 0 && Nxeven && Nyeven)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag);
|
|
+
|
|
+ if(id_x < x_total)
|
|
+ {
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)),
|
|
+ id_batch + xstride * (id_x + 1),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(Nyeven)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
|
|
+ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
|
|
+ xreal,
|
|
+ ximag);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tcomplex>
|
|
+__global__ static void impose_hermitian_symmetry_interleaved_3D_kernel(Tcomplex* x,
|
|
+ const size_t Nx,
|
|
+ const size_t Ny,
|
|
+ const size_t Nz,
|
|
+ const size_t xstride,
|
|
+ const size_t ystride,
|
|
+ const size_t zstride,
|
|
+ const size_t dist,
|
|
+ const size_t batch_total,
|
|
+ const size_t x_total,
|
|
+ const size_t y_total,
|
|
+ const size_t y_total_half,
|
|
+ const bool Nxeven,
|
|
+ const bool Nyeven,
|
|
+ const bool Nzeven)
|
|
+{
|
|
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
|
|
+ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
|
|
+ const auto id_y = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z;
|
|
+ static_assert(sizeof(id_batch) == sizeof(size_t));
|
|
+ static_assert(sizeof(id_x) == sizeof(size_t));
|
|
+ static_assert(sizeof(id_y) == sizeof(size_t));
|
|
+
|
|
+ if(id_batch < batch_total)
|
|
+ {
|
|
+ auto id_x_y_zero = (id_x == 0 && id_y == 0);
|
|
+
|
|
+ id_batch *= dist;
|
|
+
|
|
+ if(id_x_y_zero)
|
|
+ set_imag_zero(id_batch, x);
|
|
+
|
|
+ if(Nxeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2), x);
|
|
+
|
|
+ if(Nyeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + ystride * (Ny / 2), x);
|
|
+
|
|
+ if(Nzeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + zstride * (Nz / 2), x);
|
|
+
|
|
+ if(Nxeven && Nyeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x);
|
|
+
|
|
+ if(Nxeven && Nzeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), x);
|
|
+
|
|
+ if(Nyeven && Nzeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), x);
|
|
+
|
|
+ if(Nxeven && Nyeven && Nzeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2),
|
|
+ x);
|
|
+
|
|
+ if(id_x == 0 && id_y < y_total_half)
|
|
+ conjugate(id_batch + ystride * (Ny - (id_y + 1)), id_batch + ystride * (id_y + 1), x);
|
|
+
|
|
+ if(Nxeven && id_x == 0 && id_y < y_total_half)
|
|
+ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)),
|
|
+ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1),
|
|
+ x);
|
|
+
|
|
+ if(id_x < x_total && id_y == 0)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x);
|
|
+
|
|
+ if(Nyeven && id_x < x_total && id_y == 0)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
|
|
+ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
|
|
+ x);
|
|
+
|
|
+ if(id_x < x_total && id_y < y_total)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)),
|
|
+ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1),
|
|
+ x);
|
|
+
|
|
+ if(Nzeven)
|
|
+ {
|
|
+ if(id_x < x_total && id_y == 0)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
|
|
+ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
|
|
+ x);
|
|
+
|
|
+ if(Nyeven && id_x < x_total && id_y == 0)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
|
|
+ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
|
|
+ x);
|
|
+
|
|
+ if(id_x == 0 && id_y < y_total_half)
|
|
+ conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2),
|
|
+ id_batch + ystride * (id_y + 1) + zstride * (Nz / 2),
|
|
+ x);
|
|
+
|
|
+ if(Nxeven && id_x == 0 && id_y < y_total_half)
|
|
+ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1))
|
|
+ + zstride * (Nz / 2),
|
|
+ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2),
|
|
+ x);
|
|
+
|
|
+ if(id_x < x_total && id_y < y_total)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1))
|
|
+ + zstride * (Nz / 2),
|
|
+ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1)
|
|
+ + zstride * (Nz / 2),
|
|
+ x);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat>
|
|
+__global__ static void impose_hermitian_symmetry_planar_3D_kernel(Tfloat* xreal,
|
|
+ Tfloat* ximag,
|
|
+ const size_t Nx,
|
|
+ const size_t Ny,
|
|
+ const size_t Nz,
|
|
+ const size_t xstride,
|
|
+ const size_t ystride,
|
|
+ const size_t zstride,
|
|
+ const size_t dist,
|
|
+ const size_t batch_total,
|
|
+ const size_t x_total,
|
|
+ const size_t y_total,
|
|
+ const size_t y_total_half,
|
|
+ const bool Nxeven,
|
|
+ const bool Nyeven,
|
|
+ const bool Nzeven)
|
|
+{
|
|
+ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
|
|
+ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
|
|
+ const auto id_y = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z;
|
|
+ static_assert(sizeof(id_batch) == sizeof(size_t));
|
|
+ static_assert(sizeof(id_x) == sizeof(size_t));
|
|
+ static_assert(sizeof(id_y) == sizeof(size_t));
|
|
+
|
|
+ if(id_batch < batch_total)
|
|
+ {
|
|
+ auto id_x_y_zero = (id_x == 0 && id_y == 0);
|
|
+
|
|
+ id_batch *= dist;
|
|
+
|
|
+ if(id_x_y_zero)
|
|
+ set_imag_zero(id_batch, xreal, ximag);
|
|
+
|
|
+ if(Nxeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2), xreal, ximag);
|
|
+
|
|
+ if(Nyeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag);
|
|
+
|
|
+ if(Nzeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + zstride * (Nz / 2), xreal, ximag);
|
|
+
|
|
+ if(Nxeven && Nyeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag);
|
|
+
|
|
+ if(Nxeven && Nzeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), xreal, ximag);
|
|
+
|
|
+ if(Nyeven && Nzeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), xreal, ximag);
|
|
+
|
|
+ if(Nxeven && Nyeven && Nzeven && id_x_y_zero)
|
|
+ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(id_x == 0 && id_y < y_total_half)
|
|
+ conjugate(id_batch + ystride * (Ny - (id_y + 1)),
|
|
+ id_batch + ystride * (id_y + 1),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(Nxeven && id_x == 0 && id_y < y_total_half)
|
|
+ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)),
|
|
+ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(id_x < x_total && id_y == 0)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)),
|
|
+ id_batch + xstride * (id_x + 1),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(Nyeven && id_x < x_total && id_y == 0)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
|
|
+ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(id_x < x_total && id_y < y_total)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)),
|
|
+ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(Nzeven)
|
|
+ {
|
|
+ if(id_x < x_total && id_y == 0)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
|
|
+ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(Nyeven && id_x < x_total && id_y == 0)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
|
|
+ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(id_x == 0 && id_y < y_total_half)
|
|
+ conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2),
|
|
+ id_batch + ystride * (id_y + 1) + zstride * (Nz / 2),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(Nxeven && id_x == 0 && id_y < y_total_half)
|
|
+ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1))
|
|
+ + zstride * (Nz / 2),
|
|
+ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2),
|
|
+ xreal,
|
|
+ ximag);
|
|
+
|
|
+ if(id_x < x_total && id_y < y_total)
|
|
+ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1))
|
|
+ + zstride * (Nz / 2),
|
|
+ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1)
|
|
+ + zstride * (Nz / 2),
|
|
+ xreal,
|
|
+ ximag);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+// get grid dimensions for data gen kernel
+static dim3 generate_data_gridDim(const size_t isize)
+{
+    auto blockSize = DATA_GEN_THREADS;
+    // total number of blocks needed in the grid
+    auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
+
+    // Total work items per dimension in the grid is counted in
+    // uint32_t. Since each thread initializes one element, very
+    // large amounts of data will overflow this total size if we do
+    // all this work in one grid dimension, causing launch failure.
+    //
+    // CUDA also generally allows for effectively unlimited grid X
+    // dim, but Y and Z are more limited.
+    auto gridDim_y = std::min<unsigned int>(DATA_GEN_GRID_Y_MAX, numBlocks_setup);
+    auto gridDim_x = DivRoundingUp<unsigned int>(numBlocks_setup, DATA_GEN_GRID_Y_MAX);
+    return {gridDim_x, gridDim_y};
+}
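A worked example of the overflow-avoidance arithmetic above, as a standalone sketch; DATA_GEN_THREADS = 32 and DATA_GEN_GRID_Y_MAX = 65536 are stand-in values for the constants defined earlier in this header:

#include <algorithm>
#include <cstddef>
#include <cstdio>

static size_t div_rounding_up(size_t a, size_t b)
{
    return (a + b - 1) / b;
}

int main()
{
    const size_t DATA_GEN_THREADS    = 32;
    const size_t DATA_GEN_GRID_Y_MAX = 65536;
    const size_t isize               = size_t(1) << 33; // too many elements for one grid dim
    const size_t numBlocks           = div_rounding_up(isize, DATA_GEN_THREADS); // 2^28
    const size_t gridDim_y           = std::min(DATA_GEN_GRID_Y_MAX, numBlocks); // 65536
    const size_t gridDim_x           = div_rounding_up(numBlocks, DATA_GEN_GRID_Y_MAX); // 4096
    // gridDim_x * gridDim_y >= numBlocks, so every element still gets a thread
    std::printf("blocks=%zu grid=(%zu,%zu)\n", numBlocks, gridDim_x, gridDim_y);
    return 0;
}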
+
|
|
+// get grid dimensions for hermitian symmetrizer kernel
|
|
+static dim3 generate_hermitian_gridDim(const std::vector<size_t>& length,
|
|
+ const size_t batch,
|
|
+ const size_t blockSize)
|
|
+{
|
|
+ dim3 gridDim;
|
|
+
|
|
+ switch(length.size())
|
|
+ {
|
|
+ case 1:
|
|
+ gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize));
|
|
+ break;
|
|
+ case 2:
|
|
+ gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize),
|
|
+ DivRoundingUp<size_t>((length[0] + 1) / 2 - 1, blockSize));
|
|
+ break;
|
|
+ case 3:
|
|
+ gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize),
|
|
+ DivRoundingUp<size_t>((length[0] + 1) / 2 - 1, blockSize),
|
|
+ DivRoundingUp<size_t>(length[1] - 1, blockSize));
|
|
+ break;
|
|
+ default:
|
|
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
|
|
+ }
|
|
+
|
|
+ return gridDim;
|
|
+}
|
|
+
|
|
+static dim3 generate_blockDim(const std::vector<size_t>& length, const size_t blockSize)
|
|
+{
|
|
+ dim3 blockDim;
|
|
+
|
|
+ switch(length.size())
|
|
+ {
|
|
+ case 1:
|
|
+ blockDim = dim3(blockSize);
|
|
+ break;
|
|
+ case 2:
|
|
+ blockDim = dim3(blockSize, blockSize);
|
|
+ break;
|
|
+ case 3:
|
|
+ blockDim = dim3(blockSize, blockSize, blockSize);
|
|
+ break;
|
|
+ default:
|
|
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
|
|
+ }
|
|
+
|
|
+ return blockDim;
|
|
+}
|
|
+
|
|
+template <typename Tint, typename Treal>
|
|
+static void generate_random_interleaved_data(const Tint& whole_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint& whole_stride,
|
|
+ rocfft_complex<Treal>* input_data,
|
|
+ const hipDeviceProp_t& deviceProp)
|
|
+{
|
|
+ auto input_length = get_input_val(whole_length);
|
|
+ auto zero_length = make_zero_length(input_length);
|
|
+ auto input_stride = get_input_val(whole_stride);
|
|
+
|
|
+ dim3 gridDim = generate_data_gridDim(isize);
|
|
+ dim3 blockDim{DATA_GEN_THREADS};
|
|
+
|
|
+ launch_limits_check("generate_random_interleaved_data_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(
|
|
+ HIP_KERNEL_NAME(generate_random_interleaved_data_kernel<decltype(input_length), Treal>),
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0, // sharedMemBytes
|
|
+ 0, // stream
|
|
+ input_length,
|
|
+ zero_length,
|
|
+ idist,
|
|
+ isize,
|
|
+ input_stride,
|
|
+ input_data);
|
|
+ auto err = hipGetLastError();
|
|
+ if(err != hipSuccess)
|
|
+ throw std::runtime_error("generate_random_interleaved_data_kernel launch failure: "
|
|
+ + std::string(hipGetErrorName(err)));
|
|
+}
|
|
+
|
|
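A hedged usage sketch for the launcher above, assuming this header and the HIP runtime are included (error checking omitted; the function name is invented): one contiguous batch of 1024 random complex floats.

#include <hip/hip_runtime.h>

void fill_random_complex_1d()
{
    const size_t           n      = 1024;
    rocfft_complex<float>* d_data = nullptr;
    (void)hipMalloc(&d_data, n * sizeof(*d_data));

    hipDeviceProp_t prop;
    (void)hipGetDeviceProperties(&prop, 0);

    // contiguous single batch: whole_length == idist == isize, unit stride
    generate_random_interleaved_data(n, n, n, size_t(1), d_data, prop);
    (void)hipDeviceSynchronize();
    (void)hipFree(d_data);
}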
+template <typename Tint, typename Treal>
|
|
+static void generate_interleaved_data(const Tint& whole_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint& whole_stride,
|
|
+ const size_t nbatch,
|
|
+ rocfft_complex<Treal>* input_data,
|
|
+ const hipDeviceProp_t& deviceProp)
|
|
+{
|
|
+ const auto input_length = get_input_val(whole_length);
|
|
+ const auto input_stride = get_input_val(whole_stride);
|
|
+ const auto unit_stride = make_unit_stride(input_length);
|
|
+
|
|
+ const auto inv_scale
|
|
+ = static_cast<Treal>(1.0)
|
|
+ / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1);
|
|
+
|
|
+ dim3 gridDim = generate_data_gridDim(isize);
|
|
+ dim3 blockDim{DATA_GEN_THREADS};
|
|
+
|
|
+ launch_limits_check("generate_interleaved_data_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(
|
|
+ HIP_KERNEL_NAME(generate_interleaved_data_kernel<decltype(input_length), Treal>),
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0, // sharedMemBytes
|
|
+ 0, // stream
|
|
+ input_length,
|
|
+ idist,
|
|
+ isize,
|
|
+ input_stride,
|
|
+ unit_stride,
|
|
+ inv_scale,
|
|
+ input_data);
|
|
+ auto err = hipGetLastError();
|
|
+ if(err != hipSuccess)
|
|
+ throw std::runtime_error("generate_interleaved_data_kernel launch failure: "
|
|
+ + std::string(hipGetErrorName(err)));
|
|
+}
|
|
+
|
|
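For intuition about the deterministic generator above: with a single batch of isize = 5 contiguous elements, inv_scale = 1/(5 - 1) = 0.25, so the kernel writes -0.5, -0.25, 0.0, 0.25, 0.5 into both the real and imaginary parts; the generated values always sweep [-0.5, 0.5] evenly across each batch.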
+template <typename Tint, typename Treal>
|
|
+static void generate_random_planar_data(const Tint& whole_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint& whole_stride,
|
|
+ Treal* real_data,
|
|
+ Treal* imag_data,
|
|
+ const hipDeviceProp_t& deviceProp)
|
|
+{
|
|
+ const auto input_length = get_input_val(whole_length);
|
|
+ const auto zero_length = make_zero_length(input_length);
|
|
+ const auto input_stride = get_input_val(whole_stride);
|
|
+
|
|
+ dim3 gridDim = generate_data_gridDim(isize);
|
|
+ dim3 blockDim{DATA_GEN_THREADS};
|
|
+
|
|
+ launch_limits_check("generate_random_planar_data_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(
|
|
+ HIP_KERNEL_NAME(generate_random_planar_data_kernel<decltype(input_length), Treal>),
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0, // sharedMemBytes
|
|
+ 0, // stream
|
|
+ input_length,
|
|
+ zero_length,
|
|
+ idist,
|
|
+ isize,
|
|
+ input_stride,
|
|
+ real_data,
|
|
+ imag_data);
|
|
+ auto err = hipGetLastError();
|
|
+ if(err != hipSuccess)
|
|
+ throw std::runtime_error("generate_random_planar_data_kernel launch failure: "
|
|
+ + std::string(hipGetErrorName(err)));
|
|
+}
|
|
+
|
|
+template <typename Tint, typename Treal>
|
|
+static void generate_planar_data(const Tint& whole_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint& whole_stride,
|
|
+ const size_t nbatch,
|
|
+ Treal* real_data,
|
|
+ Treal* imag_data,
|
|
+ const hipDeviceProp_t& deviceProp)
|
|
+{
|
|
+ const auto input_length = get_input_val(whole_length);
|
|
+ const auto input_stride = get_input_val(whole_stride);
|
|
+ const auto unit_stride = make_unit_stride(input_length);
|
|
+
|
|
+ const auto inv_scale
|
|
+ = static_cast<Treal>(1.0)
|
|
+ / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1);
|
|
+
|
|
+ dim3 gridDim = generate_data_gridDim(isize);
|
|
+ dim3 blockDim{DATA_GEN_THREADS};
|
|
+
|
|
+ launch_limits_check("generate_planar_data_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_planar_data_kernel<decltype(input_length), Treal>),
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0, // sharedMemBytes
|
|
+ 0, // stream
|
|
+ input_length,
|
|
+ idist,
|
|
+ isize,
|
|
+ input_stride,
|
|
+ unit_stride,
|
|
+ inv_scale,
|
|
+ real_data,
|
|
+ imag_data);
|
|
+ auto err = hipGetLastError();
|
|
+ if(err != hipSuccess)
|
|
+ throw std::runtime_error("generate_planar_data_kernel launch failure: "
|
|
+ + std::string(hipGetErrorName(err)));
|
|
+}
|
|
+
|
|
+template <typename Tint, typename Treal>
|
|
+static void generate_random_real_data(const Tint& whole_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint& whole_stride,
|
|
+ Treal* input_data,
|
|
+ const hipDeviceProp_t& deviceProp)
|
|
+{
|
|
+ const auto input_length = get_input_val(whole_length);
|
|
+ const auto zero_length = make_zero_length(input_length);
|
|
+ const auto input_stride = get_input_val(whole_stride);
|
|
+
|
|
+ dim3 gridDim = generate_data_gridDim(isize);
|
|
+ dim3 blockDim{DATA_GEN_THREADS};
|
|
+
|
|
+ launch_limits_check("generate_random_real_data_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(
|
|
+ HIP_KERNEL_NAME(generate_random_real_data_kernel<decltype(input_length), Treal>),
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0, // sharedMemBytes
|
|
+ 0, // stream
|
|
+ input_length,
|
|
+ zero_length,
|
|
+ idist,
|
|
+ isize,
|
|
+ input_stride,
|
|
+ input_data);
|
|
+ auto err = hipGetLastError();
|
|
+ if(err != hipSuccess)
|
|
+ throw std::runtime_error("generate_random_real_data_kernel launch failure: "
|
|
+ + std::string(hipGetErrorName(err)));
|
|
+}
|
|
+
|
|
+template <typename Tint, typename Treal>
|
|
+static void generate_real_data(const Tint& whole_length,
|
|
+ const size_t idist,
|
|
+ const size_t isize,
|
|
+ const Tint& whole_stride,
|
|
+ const size_t nbatch,
|
|
+ Treal* input_data,
|
|
+ const hipDeviceProp_t& deviceProp)
|
|
+{
|
|
+ const auto input_length = get_input_val(whole_length);
|
|
+ const auto input_stride = get_input_val(whole_stride);
|
|
+ const auto unit_stride = make_unit_stride(input_length);
|
|
+
|
|
+ const auto inv_scale
|
|
+ = static_cast<Treal>(1.0)
|
|
+ / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1);
|
|
+
|
|
+ dim3 gridDim = generate_data_gridDim(isize);
|
|
+ dim3 blockDim{DATA_GEN_THREADS};
|
|
+
|
|
+ launch_limits_check("generate_real_data_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_real_data_kernel<decltype(input_length), Treal>),
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0, // sharedMemBytes
|
|
+ 0, // stream
|
|
+ input_length,
|
|
+ idist,
|
|
+ isize,
|
|
+ input_stride,
|
|
+ unit_stride,
|
|
+ inv_scale,
|
|
+ input_data);
|
|
+ auto err = hipGetLastError();
|
|
+ if(err != hipSuccess)
|
|
+ throw std::runtime_error("generate_real_data_kernel launch failure: "
|
|
+ + std::string(hipGetErrorName(err)));
|
|
+}
|
|
+
|
|
+template <typename Tcomplex>
|
|
+static void impose_hermitian_symmetry_interleaved(const std::vector<size_t>& length,
|
|
+ const std::vector<size_t>& ilength,
|
|
+ const std::vector<size_t>& stride,
|
|
+ const size_t dist,
|
|
+ const size_t batch,
|
|
+ Tcomplex* input_data,
|
|
+ const hipDeviceProp_t& deviceProp)
|
|
+{
|
|
+ auto blockSize = DATA_GEN_THREADS;
|
|
+ auto blockDim = generate_blockDim(length, blockSize);
|
|
+ auto gridDim = generate_hermitian_gridDim(length, batch, blockSize);
|
|
+
|
|
+ switch(length.size())
|
|
+ {
|
|
+ case 1:
|
|
+ {
|
|
+ launch_limits_check(
|
|
+ "impose_hermitian_symmetry_interleaved_1D_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel<Tcomplex>,
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0,
|
|
+ 0,
|
|
+ input_data,
|
|
+ length[0],
|
|
+ stride[0],
|
|
+ dist,
|
|
+ batch,
|
|
+ length[0] % 2 == 0);
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ case 2:
|
|
+ {
|
|
+ launch_limits_check(
|
|
+ "impose_hermitian_symmetry_interleaved_2D_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel<Tcomplex>,
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0,
|
|
+ 0,
|
|
+ input_data,
|
|
+ length[0],
|
|
+ length[1],
|
|
+ stride[0],
|
|
+ stride[1],
|
|
+ dist,
|
|
+ batch,
|
|
+ (ilength[0] + 1) / 2 - 1,
|
|
+ length[0] % 2 == 0,
|
|
+ length[1] % 2 == 0);
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ case 3:
|
|
+ {
|
|
+ launch_limits_check(
|
|
+ "impose_hermitian_symmetry_interleaved_3D_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel<Tcomplex>,
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0,
|
|
+ 0,
|
|
+ input_data,
|
|
+ length[0],
|
|
+ length[1],
|
|
+ length[2],
|
|
+ stride[0],
|
|
+ stride[1],
|
|
+ stride[2],
|
|
+ dist,
|
|
+ batch,
|
|
+ (ilength[0] + 1) / 2 - 1,
|
|
+ ilength[1] - 1,
|
|
+ (ilength[1] + 1) / 2 - 1,
|
|
+ length[0] % 2 == 0,
|
|
+ length[1] % 2 == 0,
|
|
+ length[2] % 2 == 0);
|
|
+ break;
|
|
+ }
|
|
+ default:
|
|
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
|
|
+ }
|
|
+ auto err = hipGetLastError();
|
|
+ if(err != hipSuccess)
|
|
+ throw std::runtime_error("impose_hermitian_symmetry_interleaved_kernel launch failure: "
|
|
+ + std::string(hipGetErrorName(err)));
|
|
+}
|
|
+
|
|
+template <typename Tfloat>
|
|
+static void impose_hermitian_symmetry_planar(const std::vector<size_t>& length,
|
|
+ const std::vector<size_t>& ilength,
|
|
+ const std::vector<size_t>& stride,
|
|
+ const size_t dist,
|
|
+ const size_t batch,
|
|
+ Tfloat* input_data_real,
|
|
+ Tfloat* input_data_imag,
|
|
+ const hipDeviceProp_t& deviceProp)
|
|
+{
|
|
+ auto blockSize = DATA_GEN_THREADS;
|
|
+ auto blockDim = generate_blockDim(length, blockSize);
|
|
+ auto gridDim = generate_hermitian_gridDim(length, batch, blockSize);
|
|
+
|
|
+ switch(length.size())
|
|
+ {
|
|
+ case 1:
|
|
+ {
|
|
+ launch_limits_check(
|
|
+ "impose_hermitian_symmetry_planar_1D_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1D_kernel<Tfloat>,
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0,
|
|
+ 0,
|
|
+ input_data_real,
|
|
+ input_data_imag,
|
|
+ length[0],
|
|
+ stride[0],
|
|
+ dist,
|
|
+ batch,
|
|
+ length[0] % 2 == 0);
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ case 2:
|
|
+ {
|
|
+ launch_limits_check(
|
|
+ "impose_hermitian_symmetry_planar_2D_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2D_kernel<Tfloat>,
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0,
|
|
+ 0,
|
|
+ input_data_real,
|
|
+ input_data_imag,
|
|
+ length[0],
|
|
+ length[1],
|
|
+ stride[0],
|
|
+ stride[1],
|
|
+ dist,
|
|
+ batch,
|
|
+ (ilength[0] + 1) / 2 - 1,
|
|
+ length[0] % 2 == 0,
|
|
+ length[1] % 2 == 0);
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ case 3:
|
|
+ {
|
|
+ launch_limits_check(
|
|
+ "impose_hermitian_symmetry_planar_3D_kernel", gridDim, blockDim, deviceProp);
|
|
+
|
|
+ hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3D_kernel<Tfloat>,
|
|
+ gridDim,
|
|
+ blockDim,
|
|
+ 0,
|
|
+ 0,
|
|
+ input_data_real,
|
|
+ input_data_imag,
|
|
+ length[0],
|
|
+ length[1],
|
|
+ length[2],
|
|
+ stride[0],
|
|
+ stride[1],
|
|
+ stride[2],
|
|
+ dist,
|
|
+ batch,
|
|
+ (ilength[0] + 1) / 2 - 1,
|
|
+ ilength[1] - 1,
|
|
+ (ilength[1] + 1) / 2 - 1,
|
|
+ length[0] % 2 == 0,
|
|
+ length[1] % 2 == 0,
|
|
+ length[2] % 2 == 0);
|
|
+ break;
|
|
+ }
|
|
+ default:
|
|
+ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
|
|
+ }
|
|
+ auto err = hipGetLastError();
|
|
+ if(err != hipSuccess)
|
|
+ throw std::runtime_error("impose_hermitian_symmetry_planar_kernel launch failure: "
|
|
+ + std::string(hipGetErrorName(err)));
|
|
+}
|
|
+
|
|
+#endif // DATA_GEN_DEVICE_H
diff --git a/shared/data_gen_host.h b/shared/data_gen_host.h
new file mode 100644
index 0000000..29d3854
--- /dev/null
+++ b/shared/data_gen_host.h
@@ -0,0 +1,881 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
+//
|
|
+// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
+// of this software and associated documentation files (the "Software"), to deal
|
|
+// in the Software without restriction, including without limitation the rights
|
|
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
+// copies of the Software, and to permit persons to whom the Software is
|
|
+// furnished to do so, subject to the following conditions:
|
|
+//
|
|
+// The above copyright notice and this permission notice shall be included in
|
|
+// all copies or substantial portions of the Software.
|
|
+//
|
|
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
+// THE SOFTWARE.
|
|
+
|
|
+#ifndef DATA_GEN_HOST_H
|
|
+#define DATA_GEN_HOST_H
|
|
+
|
|
+#include "../shared/hostbuf.h"
|
|
+#include "../shared/increment.h"
|
|
+#include <complex>
|
|
+#include <limits>
|
|
+#include <random>
|
|
+#include <tuple>
|
|
+#include <vector>
|
|
+
|
|
+// Specialized computation of index given 1-, 2-, 3- dimension length + stride
+template <typename T1, typename T2>
+size_t compute_index(T1 length, T2 stride, size_t base)
+{
+    return (length * stride) + base;
+}
+
+template <typename T1, typename T2>
+size_t
+    compute_index(const std::tuple<T1, T1>& length, const std::tuple<T2, T2>& stride, size_t base)
+{
+    static_assert(std::is_integral<T1>::value, "Integral required.");
+    static_assert(std::is_integral<T2>::value, "Integral required.");
+    return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
+           + base;
+}
+
+template <typename T1, typename T2>
+size_t compute_index(const std::tuple<T1, T1, T1>& length,
+                     const std::tuple<T2, T2, T2>& stride,
+                     size_t base)
+{
+    static_assert(std::is_integral<T1>::value, "Integral required.");
+    static_assert(std::is_integral<T2>::value, "Integral required.");
+    return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
+           + (std::get<2>(length) * std::get<2>(stride)) + base;
+}
+
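A small usage sketch of the compute_index overloads above, assuming this header is included (the numbers are illustrative):

#include <cstdio>
#include <tuple>

int main()
{
    // index (2, 3) with strides (8, 1) and base offset 100:
    // 2 * 8 + 3 * 1 + 100 = 119
    const size_t idx = compute_index(std::make_tuple(2, 3), std::make_tuple(8, 1), 100);
    std::printf("%zu\n", idx); // prints 119
    return 0;
}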
+// count the number of total iterations for 1-, 2-, and 3-D dimensions
|
|
+template <typename T1>
|
|
+size_t count_iters(const T1& i)
|
|
+{
|
|
+ return i;
|
|
+}
|
|
+
|
|
+template <typename T1>
|
|
+size_t count_iters(const std::tuple<T1, T1>& i)
|
|
+{
|
|
+ return std::get<0>(i) * std::get<1>(i);
|
|
+}
|
|
+
|
|
+template <typename T1>
|
|
+size_t count_iters(const std::tuple<T1, T1, T1>& i)
|
|
+{
|
|
+ return std::get<0>(i) * std::get<1>(i) * std::get<2>(i);
|
|
+}
|
|
+
|
|
+template <typename T1>
|
|
+T1 make_unit_stride(const T1& whole_length)
|
|
+{
|
|
+ return static_cast<T1>(1);
|
|
+}
|
|
+
|
|
+template <typename T1>
|
|
+std::tuple<T1, T1> make_unit_stride(const std::tuple<T1, T1>& whole_length)
|
|
+{
|
|
+ return std::make_tuple(static_cast<T1>(1), static_cast<T1>(std::get<0>(whole_length)));
|
|
+}
|
|
+
|
|
+template <typename T1>
|
|
+std::tuple<T1, T1, T1> make_unit_stride(const std::tuple<T1, T1, T1>& whole_length)
|
|
+{
|
|
+ return std::make_tuple(static_cast<T1>(1),
|
|
+ static_cast<T1>(std::get<0>(whole_length)),
|
|
+ static_cast<T1>(std::get<0>(whole_length))
|
|
+ * static_cast<T1>(std::get<1>(whole_length)));
|
|
+}
|
|
+
|
|
+// Work out how many partitions to break our iteration problem into
+template <typename T1>
+static size_t compute_partition_count(T1 length)
+{
+#ifdef _OPENMP
+    // we seem to get contention from too many threads, which slows
+    // things down. particularly noticeable with mix_3D tests
+    static const size_t MAX_PARTITIONS = 8;
+    size_t iters      = count_iters(length);
+    size_t hw_threads = std::min(MAX_PARTITIONS, static_cast<size_t>(omp_get_num_procs()));
+    if(!hw_threads)
+        return 1;
+
+    // don't bother threading problem sizes that are too small. pick
+    // an arbitrary number of iterations and ensure that each thread
+    // has at least that many iterations to process
+    static const size_t MIN_ITERS_PER_THREAD = 2048;
+
+    // either use the whole CPU, or use ceil(iters/iters_per_thread)
+    return std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD + 1) / MIN_ITERS_PER_THREAD);
+#else
+    return 1;
+#endif
+}
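A worked example of the heuristic above (numbers invented for illustration): for count_iters(length) = 10000 on a machine with at least 8 cores, the result is min(8, (10000 + 2048 + 1) / 2048) = min(8, 5) = 5 partitions, so each thread still has roughly 2000 iterations of work.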
+
|
|
+// Break a scalar length into some number of pieces, returning
+// [(start0, end0), (start1, end1), ...]
+template <typename T1>
+std::vector<std::pair<T1, T1>> partition_base(const T1& length, size_t num_parts)
+{
+    static_assert(std::is_integral<T1>::value, "Integral required.");
+
+    // make sure we don't exceed the length
+    num_parts = std::min(length, num_parts);
+
+    std::vector<std::pair<T1, T1>> ret(num_parts);
+    auto partition_size = length / num_parts;
+    T1   cur_partition  = 0;
+    for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size)
+    {
+        ret[i].first  = cur_partition;
+        ret[i].second = cur_partition + partition_size;
+    }
+    // last partition might not divide evenly, fix it up
+    ret.back().second = length;
+    return ret;
+}
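A brief sketch of what partition_base produces, assuming this header is included:

#include <cstddef>
#include <cstdio>

int main()
{
    // length 10 split into 3 parts; the last part absorbs the remainder
    const auto parts = partition_base<size_t>(10, 3);
    for(const auto& p : parts)
        std::printf("[%zu, %zu) ", p.first, p.second); // [0, 3) [3, 6) [6, 10)
    std::printf("\n");
    return 0;
}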
+
|
|
+// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
|
|
+template <typename T1>
|
|
+std::vector<std::pair<T1, T1>> partition_rowmajor(const T1& length)
|
|
+{
|
|
+ return partition_base(length, compute_partition_count(length));
|
|
+}
|
|
+
|
|
+// Partition on the leftmost part of the tuple, for row-major indexing
|
|
+template <typename T1>
|
|
+std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
|
|
+ partition_rowmajor(const std::tuple<T1, T1>& length)
|
|
+{
|
|
+ auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
|
|
+ std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
|
|
+ for(size_t i = 0; i < partitions.size(); ++i)
|
|
+ {
|
|
+ std::get<0>(ret[i].first) = partitions[i].first;
|
|
+ std::get<1>(ret[i].first) = 0;
|
|
+ std::get<0>(ret[i].second) = partitions[i].second;
|
|
+ std::get<1>(ret[i].second) = std::get<1>(length);
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+template <typename T1>
|
|
+std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
|
|
+ partition_rowmajor(const std::tuple<T1, T1, T1>& length)
|
|
+{
|
|
+ auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
|
|
+ std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
|
|
+ for(size_t i = 0; i < partitions.size(); ++i)
|
|
+ {
|
|
+ std::get<0>(ret[i].first) = partitions[i].first;
|
|
+ std::get<1>(ret[i].first) = 0;
|
|
+ std::get<2>(ret[i].first) = 0;
|
|
+ std::get<0>(ret[i].second) = partitions[i].second;
|
|
+ std::get<1>(ret[i].second) = std::get<1>(length);
|
|
+ std::get<2>(ret[i].second) = std::get<2>(length);
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+// For complex-to-real transforms, the input data must be Hermitian-symmetric.
+// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
+// space. For multi-dimensional data, this means that we only need to store a bit more
+// than half of the complex values; the rest are redundant. However, there are still
+// some restrictions:
+// * the origin and Nyquist value(s) must be real-valued
+// * some of the remaining values are still redundant, and you might get different results
+//   than you expect if the values don't agree.
+// Below are some example functions which impose Hermitian symmetry on a complex array
+// of the given dimensions.
+
|
|
+template <typename Tfloat, typename Tsize>
|
|
+static void impose_hermitian_symmetry_interleaved_1D(std::vector<hostbuf>& vals,
|
|
+ const std::vector<Tsize>& length,
|
|
+ const std::vector<Tsize>& istride,
|
|
+ const Tsize idist,
|
|
+ const Tsize nbatch)
|
|
+{
|
|
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
|
|
+ {
|
|
+ auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist;
|
|
+
|
|
+ data[0].imag(0.0);
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ data[istride[0] * (length[0] / 2)].imag(0.0);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat, typename Tsize>
|
|
+static void impose_hermitian_symmetry_planar_1D(std::vector<hostbuf>& vals,
|
|
+ const std::vector<Tsize>& length,
|
|
+ const std::vector<Tsize>& istride,
|
|
+ const Tsize idist,
|
|
+ const Tsize nbatch)
|
|
+{
|
|
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
|
|
+ {
|
|
+ auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist;
|
|
+
|
|
+ data_imag[0] = 0.0;
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[0] * (length[0] / 2)] = 0.0;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat, typename Tsize>
|
|
+static void impose_hermitian_symmetry_interleaved_2D(std::vector<hostbuf>& vals,
|
|
+ const std::vector<Tsize>& length,
|
|
+ const std::vector<Tsize>& istride,
|
|
+ const Tsize idist,
|
|
+ const Tsize nbatch)
|
|
+{
|
|
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
|
|
+ {
|
|
+ auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist;
|
|
+
|
|
+ data[0].imag(0.0);
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ data[istride[0] * (length[0] / 2)].imag(0.0);
|
|
+ }
|
|
+
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ data[istride[1] * (length[1] / 2)].imag(0.0);
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0 && length[1] % 2 == 0)
|
|
+ {
|
|
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0);
|
|
+ }
|
|
+
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]);
|
|
+ }
|
|
+
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
|
|
+ = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat, typename Tsize>
|
|
+static void impose_hermitian_symmetry_planar_2D(std::vector<hostbuf>& vals,
|
|
+ const std::vector<Tsize>& length,
|
|
+ const std::vector<Tsize>& istride,
|
|
+ const Tsize idist,
|
|
+ const Tsize nbatch)
|
|
+{
|
|
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
|
|
+ {
|
|
+ auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist;
|
|
+ auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist;
|
|
+
|
|
+ data_imag[0] = 0.0;
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[0] * (length[0] / 2)] = 0.0;
|
|
+ }
|
|
+
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[1] * (length[1] / 2)] = 0.0;
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0 && length[1] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0;
|
|
+ }
|
|
+
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i];
|
|
+ data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i];
|
|
+ }
|
|
+
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
|
|
+ = data_real[istride[0] * i + istride[1] * (length[1] / 2)];
|
|
+ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
|
|
+ = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)];
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat, typename Tsize>
|
|
+static void impose_hermitian_symmetry_interleaved_3D(std::vector<hostbuf>& vals,
|
|
+ const std::vector<Tsize>& length,
|
|
+ const std::vector<Tsize>& istride,
|
|
+ const Tsize idist,
|
|
+ const Tsize nbatch)
|
|
+{
|
|
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
|
|
+ {
|
|
+ auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist;
|
|
+
|
|
+ data[0].imag(0.0);
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ data[istride[0] * (length[0] / 2)].imag(0.0);
|
|
+ }
|
|
+
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ data[istride[1] * (length[1] / 2)].imag(0.0);
|
|
+ }
|
|
+
|
|
+ if(length[2] % 2 == 0)
|
|
+ {
|
|
+ data[istride[2] * (length[2] / 2)].imag(0.0);
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0 && length[1] % 2 == 0)
|
|
+ {
|
|
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0);
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0 && length[2] % 2 == 0)
|
|
+ {
|
|
+ data[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)].imag(0.0);
|
|
+ }
|
|
+ if(length[1] % 2 == 0 && length[2] % 2 == 0)
|
|
+ {
|
|
+ data[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)].imag(0.0);
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0)
|
|
+ {
|
|
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)
|
|
+ + istride[2] * (length[2] / 2)]
|
|
+ .imag(0.0);
|
|
+ }
|
|
+
|
|
+ // y-axis:
|
|
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
|
|
+ {
|
|
+ data[istride[1] * (length[1] - j)] = std::conj(data[istride[1] * j]);
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ // y-axis at x-nyquist
|
|
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
|
|
+ {
|
|
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)]
|
|
+ = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j]);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // x-axis:
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]);
|
|
+ }
|
|
+
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ // x-axis at y-nyquist
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
|
|
+ = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // x-y plane:
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ for(unsigned int j = 1; j < length[1]; ++j)
|
|
+ {
|
|
+ data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)]
|
|
+ = std::conj(data[istride[0] * i + istride[1] * j]);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if(length[2] % 2 == 0)
|
|
+ {
|
|
+ // x-axis at z-nyquist
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
|
|
+ = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]);
|
|
+ }
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ // x-axis at yz-nyquist
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
|
|
+ = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // y-axis: at z-nyquist
|
|
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
|
|
+ {
|
|
+ data[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)]
|
|
+ = std::conj(data[istride[1] * j + istride[2] * (length[2] / 2)]);
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ // y-axis: at xz-nyquist
|
|
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
|
|
+ {
|
|
+ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)
|
|
+ + istride[2] * (length[2] / 2)]
|
|
+ = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j
|
|
+ + istride[2] * (length[2] / 2)]);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // x-y plane: at z-nyquist
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ for(unsigned int j = 1; j < length[1]; ++j)
|
|
+ {
|
|
+ data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)
|
|
+ + istride[2] * (length[2] / 2)]
|
|
+ = std::conj(
|
|
+ data[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat, typename Tsize>
|
|
+static void impose_hermitian_symmetry_planar_3D(std::vector<hostbuf>& vals,
|
|
+ const std::vector<Tsize>& length,
|
|
+ const std::vector<Tsize>& istride,
|
|
+ const Tsize idist,
|
|
+ const Tsize nbatch)
|
|
+{
|
|
+ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
|
|
+ {
|
|
+ auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist;
|
|
+ auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist;
|
|
+
|
|
+ data_imag[0] = 0.0;
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[0] * (length[0] / 2)] = 0.0;
|
|
+ }
|
|
+
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[1] * (length[1] / 2)] = 0.0;
|
|
+ }
|
|
+
|
|
+ if(length[2] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[2] * (length[2] / 2)] = 0.0;
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0 && length[1] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0;
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0 && length[2] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)] = 0.0;
|
|
+ }
|
|
+ if(length[1] % 2 == 0 && length[2] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)] = 0.0;
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0)
|
|
+ {
|
|
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)
|
|
+ + istride[2] * (length[2] / 2)]
|
|
+ = 0.0;
|
|
+ }
|
|
+
|
|
+ // y-axis:
|
|
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
|
|
+ {
|
|
+ data_real[istride[1] * (length[1] - j)] = data_real[istride[1] * j];
|
|
+ data_imag[istride[1] * (length[1] - j)] = -data_imag[istride[1] * j];
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ // y-axis at x-nyquist
|
|
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)]
|
|
+ = data_real[istride[0] * (length[0] / 2) + istride[1] * j];
|
|
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)]
|
|
+ = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j];
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // x-axis:
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i];
|
|
+ data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i];
|
|
+ }
|
|
+
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ // x-axis at y-nyquist
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
|
|
+ = data_real[istride[0] * i + istride[1] * (length[1] / 2)];
|
|
+ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)]
|
|
+ = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)];
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // x-y plane:
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ for(unsigned int j = 1; j < length[1]; ++j)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)]
|
|
+ = data_real[istride[0] * i + istride[1] * j];
|
|
+ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)]
|
|
+ = -data_imag[istride[0] * i + istride[1] * j];
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if(length[2] % 2 == 0)
|
|
+ {
|
|
+ // x-axis at z-nyquist
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
|
|
+ = data_real[istride[0] * i + istride[2] * (length[2] / 2)];
|
|
+ data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
|
|
+ = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)];
|
|
+ }
|
|
+ if(length[1] % 2 == 0)
|
|
+ {
|
|
+ // x-axis at yz-nyquist
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
|
|
+ = data_real[istride[0] * i + istride[2] * (length[2] / 2)];
|
|
+ data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)]
|
|
+ = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)];
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // y-axis: at z-nyquist
|
|
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
|
|
+ {
|
|
+ data_real[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)]
|
|
+ = data_real[istride[1] * j + istride[2] * (length[2] / 2)];
|
|
+ data_imag[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)]
|
|
+ = -data_imag[istride[1] * j + istride[2] * (length[2] / 2)];
|
|
+ }
|
|
+
|
|
+ if(length[0] % 2 == 0)
|
|
+ {
|
|
+ // y-axis: at xz-nyquist
|
|
+ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)
|
|
+ + istride[2] * (length[2] / 2)]
|
|
+ = data_real[istride[0] * (length[0] / 2) + istride[1] * j
|
|
+ + istride[2] * (length[2] / 2)];
|
|
+ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)
|
|
+ + istride[2] * (length[2] / 2)]
|
|
+ = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j
|
|
+ + istride[2] * (length[2] / 2)];
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // x-y plane: at z-nyquist
|
|
+ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i)
|
|
+ {
|
|
+ for(unsigned int j = 1; j < length[1]; ++j)
|
|
+ {
|
|
+ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)
|
|
+ + istride[2] * (length[2] / 2)]
|
|
+ = data_real[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)];
|
|
+ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)
|
|
+ + istride[2] * (length[2] / 2)]
|
|
+ = -data_imag[istride[0] * i + istride[1] * j
|
|
+ + istride[2] * (length[2] / 2)];
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat, typename Tint1>
|
|
+static void generate_random_interleaved_data(std::vector<hostbuf>& input,
|
|
+ const Tint1& whole_length,
|
|
+ const Tint1& whole_stride,
|
|
+ const size_t idist,
|
|
+ const size_t nbatch)
|
|
+{
|
|
+ auto idata = (std::complex<Tfloat>*)input[0].data();
|
|
+ size_t i_base = 0;
|
|
+ auto partitions = partition_rowmajor(whole_length);
|
|
+ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
|
|
+ {
|
|
+#pragma omp parallel for num_threads(partitions.size())
|
|
+ for(size_t part = 0; part < partitions.size(); ++part)
|
|
+ {
|
|
+ auto index = partitions[part].first;
|
|
+ const auto length = partitions[part].second;
|
|
+ std::mt19937 gen(compute_index(index, whole_stride, i_base));
|
|
+ do
|
|
+ {
|
|
+ const auto i = compute_index(index, whole_stride, i_base);
|
|
+ const Tfloat x = (Tfloat)gen() / (Tfloat)gen.max();
|
|
+ const Tfloat y = (Tfloat)gen() / (Tfloat)gen.max();
|
|
+ const std::complex<Tfloat> val(x, y);
|
|
+ idata[i] = val;
|
|
+ } while(increment_rowmajor(index, length));
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename Tfloat, typename Tint1>
|
|
+static void generate_interleaved_data(std::vector<hostbuf>& input,
|
|
+ const Tint1& whole_length,
|
|
+ const Tint1& whole_stride,
|
|
+ const size_t idist,
|
|
+ const size_t nbatch)
|
|
+{
|
|
+ auto idata = (std::complex<Tfloat>*)input[0].data();
|
|
+ size_t i_base = 0;
|
|
+ auto partitions = partition_rowmajor(whole_length);
|
|
+ auto unit_stride = make_unit_stride(whole_length);
|
|
+
|
|
+ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
|
|
+
|
|
+ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
|
|
+ {
|
|
+#pragma omp parallel for num_threads(partitions.size())
|
|
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            auto index        = partitions[part].first;
+            const auto length = partitions[part].second;
+            do
+            {
+                const auto val_xy
+                    = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale;
+
+                const std::complex<Tfloat> val(val_xy, val_xy);
+
+                const auto i = compute_index(index, whole_stride, i_base);
+
+                idata[i] = val;
+            } while(increment_rowmajor(index, length));
+        }
+    }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_random_planar_data(std::vector<hostbuf>& input,
+                                        const Tint1& whole_length,
+                                        const Tint1& whole_stride,
+                                        const size_t idist,
+                                        const size_t nbatch)
+{
+    auto ireal      = (Tfloat*)input[0].data();
+    auto iimag      = (Tfloat*)input[1].data();
+    size_t i_base   = 0;
+    auto partitions = partition_rowmajor(whole_length);
+    for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+    {
+#pragma omp parallel for num_threads(partitions.size())
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            auto index        = partitions[part].first;
+            const auto length = partitions[part].second;
+            std::mt19937 gen(compute_index(index, whole_stride, i_base));
+            do
+            {
+                const auto i = compute_index(index, whole_stride, i_base);
+                const std::complex<Tfloat> val((Tfloat)gen() / (Tfloat)gen.max(),
+                                               (Tfloat)gen() / (Tfloat)gen.max());
+                ireal[i] = val.real();
+                iimag[i] = val.imag();
+            } while(increment_rowmajor(index, length));
+        }
+    }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_planar_data(std::vector<hostbuf>& input,
+                                 const Tint1& whole_length,
+                                 const Tint1& whole_stride,
+                                 const size_t idist,
+                                 const size_t nbatch)
+{
+
+    auto ireal       = (Tfloat*)input[0].data();
+    auto iimag       = (Tfloat*)input[1].data();
+    size_t i_base    = 0;
+    auto partitions  = partition_rowmajor(whole_length);
+    auto unit_stride = make_unit_stride(whole_length);
+
+    const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
+
+    for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+    {
+#pragma omp parallel for num_threads(partitions.size())
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            auto index        = partitions[part].first;
+            const auto length = partitions[part].second;
+            do
+            {
+                const auto val_xy
+                    = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale;
+
+                const auto i = compute_index(index, whole_stride, i_base);
+
+                ireal[i] = val_xy;
+                iimag[i] = val_xy;
+            } while(increment_rowmajor(index, length));
+        }
+    }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_random_real_data(std::vector<hostbuf>& input,
+                                      const Tint1& whole_length,
+                                      const Tint1& whole_stride,
+                                      const size_t idist,
+                                      const size_t nbatch)
+{
+    auto idata      = (Tfloat*)input[0].data();
+    size_t i_base   = 0;
+    auto partitions = partition_rowmajor(whole_length);
+    for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+    {
+#pragma omp parallel for num_threads(partitions.size())
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            auto index        = partitions[part].first;
+            const auto length = partitions[part].second;
+            std::mt19937 gen(compute_index(index, whole_stride, i_base));
+            do
+            {
+                const auto i     = compute_index(index, whole_stride, i_base);
+                const Tfloat val = (Tfloat)gen() / (Tfloat)gen.max();
+                idata[i]         = val;
+            } while(increment_rowmajor(index, length));
+        }
+    }
+}
+
+template <typename Tfloat, typename Tint1>
+static void generate_real_data(std::vector<hostbuf>& input,
+                               const Tint1& whole_length,
+                               const Tint1& whole_stride,
+                               const size_t idist,
+                               const size_t nbatch)
+{
+
+    auto idata       = (Tfloat*)input[0].data();
+    size_t i_base    = 0;
+    auto partitions  = partition_rowmajor(whole_length);
+    auto unit_stride = make_unit_stride(whole_length);
+
+    const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
+
+    for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
+    {
+#pragma omp parallel for num_threads(partitions.size())
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            auto index        = partitions[part].first;
+            const auto length = partitions[part].second;
+            do
+            {
+                const auto i = compute_index(index, whole_stride, i_base);
+
+                idata[i]
+                    = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale;
+            } while(increment_rowmajor(index, length));
+        }
+    }
+}
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_interleaved(std::vector<hostbuf>& vals,
+                                                  const std::vector<Tsize>& length,
+                                                  const std::vector<Tsize>& istride,
+                                                  const Tsize idist,
+                                                  const Tsize nbatch)
+{
+    switch(length.size())
+    {
+    case 1:
+        impose_hermitian_symmetry_interleaved_1D<Tfloat>(vals, length, istride, idist, nbatch);
+        break;
+    case 2:
+        impose_hermitian_symmetry_interleaved_2D<Tfloat>(vals, length, istride, idist, nbatch);
+        break;
+    case 3:
+        impose_hermitian_symmetry_interleaved_3D<Tfloat>(vals, length, istride, idist, nbatch);
+        break;
+    default:
+        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+    }
+}
+
+template <typename Tfloat, typename Tsize>
+static void impose_hermitian_symmetry_planar(std::vector<hostbuf>& vals,
+                                             const std::vector<Tsize>& length,
+                                             const std::vector<Tsize>& istride,
+                                             const Tsize idist,
+                                             const Tsize nbatch)
+{
+    switch(length.size())
+    {
+    case 1:
+        impose_hermitian_symmetry_planar_1D<Tfloat>(vals, length, istride, idist, nbatch);
+        break;
+    case 2:
+        impose_hermitian_symmetry_planar_2D<Tfloat>(vals, length, istride, idist, nbatch);
+        break;
+    case 3:
+        impose_hermitian_symmetry_planar_3D<Tfloat>(vals, length, istride, idist, nbatch);
+        break;
+    default:
+        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+    }
+}
+
+#endif // DATA_GEN_HOST_H
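An aside on the host generators above: each row-major partition seeds its own std::mt19937 with the linear index of the partition's first element, so the generated data is reproducible for a given layout no matter how many OpenMP threads run. A minimal standalone sketch of that seeding scheme (a plain std::vector standing in for hostbuf, contiguous 1D layout assumed; not part of the patch):

    #include <algorithm>
    #include <complex>
    #include <random>
    #include <vector>

    // Fill a contiguous buffer the way the generators above do: one RNG per
    // partition, seeded by the partition's starting linear index.
    static void fill_random(std::vector<std::complex<float>>& data, size_t nparts)
    {
        const size_t chunk = (data.size() + nparts - 1) / nparts;
        for(size_t p = 0; p < nparts; ++p)
        {
            const size_t begin = p * chunk;
            const size_t end   = std::min(data.size(), begin + chunk);
            std::mt19937 gen(begin); // seed = first linear index of the partition
            for(size_t i = begin; i < end; ++i)
                data[i] = {(float)gen() / (float)gen.max(),
                           (float)gen() / (float)gen.max()};
        }
    }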
diff --git a/shared/device_properties.h b/shared/device_properties.h
new file mode 100644
index 0000000..6e2e1e1
--- /dev/null
+++ b/shared/device_properties.h
@@ -0,0 +1,74 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_DEVICE_PROPS_H
+#define ROCFFT_DEVICE_PROPS_H
+
+#include <cstdint>
+#include <hip/hip_runtime_api.h>
+#include <stdexcept>
+
+// get device properties
+static hipDeviceProp_t get_curr_device_prop()
+{
+    hipDeviceProp_t prop;
+    int deviceId = 0;
+    if(hipGetDevice(&deviceId) != hipSuccess)
+        throw std::runtime_error("hipGetDevice failed.");
+
+    if(hipGetDeviceProperties(&prop, deviceId) != hipSuccess)
+        throw std::runtime_error("hipGetDeviceProperties failed for deviceId "
+                                 + std::to_string(deviceId));
+
+    return prop;
+}
+
+// check that the given grid/block dims will fit into the limits in
+// the device properties.  throws std::runtime_error if the limits
+// are exceeded.
+static void launch_limits_check(const std::string& kernel_name,
+                                const dim3 gridDim,
+                                const dim3 blockDim,
+                                const hipDeviceProp_t& deviceProp)
+{
+    // Need lots of casting here because dim3 is unsigned but device
+    // props are signed.  Cast direct comparisons to fix signedness
+    // issues.  Promote types to 64-bit when multiplying to try to
+    // avoid overflow.
+
+    // Block limits along each dimension
+    if(blockDim.x > static_cast<uint32_t>(deviceProp.maxThreadsDim[0])
+       || blockDim.y > static_cast<uint32_t>(deviceProp.maxThreadsDim[1])
+       || blockDim.z > static_cast<uint32_t>(deviceProp.maxThreadsDim[2]))
+        throw std::runtime_error("max threads per dim exceeded: " + kernel_name);
+
+    // Total threads for the whole block
+    if(static_cast<uint64_t>(blockDim.x) * blockDim.y * blockDim.z
+       > static_cast<uint64_t>(deviceProp.maxThreadsPerBlock))
+        throw std::runtime_error("max threads per block exceeded: " + kernel_name);
+
+    // Grid dimension limits
+    if(gridDim.x > static_cast<uint32_t>(deviceProp.maxGridSize[0])
+       || gridDim.y > static_cast<uint32_t>(deviceProp.maxGridSize[1])
+       || gridDim.z > static_cast<uint32_t>(deviceProp.maxGridSize[2]))
+        throw std::runtime_error("max grid size exceeded: " + kernel_name);
+}
+
+#endif
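A usage sketch for device_properties.h (the kernel name and launch shape here are illustrative, not part of the patch): query the current device once, then validate a launch geometry so an oversized launch fails with a clear error instead of a cryptic HIP failure.

    #include <hip/hip_runtime.h>
    #include "shared/device_properties.h"

    void checked_launch_example()
    {
        const hipDeviceProp_t prop = get_curr_device_prop();
        const dim3 grid(4096, 1, 1);
        const dim3 block(256, 1, 1);
        // Throws std::runtime_error naming the kernel if any limit is exceeded.
        launch_limits_check("example_kernel", grid, block, prop);
        // ... safe to launch a kernel with (grid, block) here.
    }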
diff --git a/shared/enum_to_string.h b/shared/enum_to_string.h
new file mode 100644
index 0000000..1c2fba0
--- /dev/null
+++ b/shared/enum_to_string.h
@@ -0,0 +1,81 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ENUM_TO_STRING_H
+#define ENUM_TO_STRING_H
+
+#include "fft_params.h"
+
+// Return the string of the hipError code.
+static std::string hipError_to_string(const hipError_t ret)
+{
+    switch(ret)
+    {
+    case hipSuccess:
+        return "hipSuccess";
+    case hipErrorInvalidContext:
+        return "hipErrorInvalidContext";
+    case hipErrorInvalidKernelFile:
+        return "hipErrorInvalidKernelFile";
+    case hipErrorMemoryAllocation:
+        return "hipErrorMemoryAllocation";
+    case hipErrorInitializationError:
+        return "hipErrorInitializationError";
+    case hipErrorLaunchFailure:
+        return "hipErrorLaunchFailure";
+    case hipErrorLaunchOutOfResources:
+        return "hipErrorLaunchOutOfResources";
+    case hipErrorInvalidDevice:
+        return "hipErrorInvalidDevice";
+    case hipErrorInvalidValue:
+        return "hipErrorInvalidValue";
+    case hipErrorInvalidDevicePointer:
+        return "hipErrorInvalidDevicePointer";
+    case hipErrorInvalidMemcpyDirection:
+        return "hipErrorInvalidMemcpyDirection";
+    case hipErrorUnknown:
+        return "hipErrorUnknown";
+    case hipErrorInvalidResourceHandle:
+        return "hipErrorInvalidResourceHandle";
+    case hipErrorNotReady:
+        return "hipErrorNotReady";
+    case hipErrorNoDevice:
+        return "hipErrorNoDevice";
+    case hipErrorPeerAccessAlreadyEnabled:
+        return "hipErrorPeerAccessAlreadyEnabled";
+    case hipErrorPeerAccessNotEnabled:
+        return "hipErrorPeerAccessNotEnabled";
+    case hipErrorRuntimeMemory:
+        return "hipErrorRuntimeMemory";
+    case hipErrorRuntimeOther:
+        return "hipErrorRuntimeOther";
+    case hipErrorHostMemoryAlreadyRegistered:
+        return "hipErrorHostMemoryAlreadyRegistered";
+    case hipErrorHostMemoryNotRegistered:
+        return "hipErrorHostMemoryNotRegistered";
+    case hipErrorMapBufferObjectFailed:
+        return "hipErrorMapBufferObjectFailed";
+    case hipErrorTbd:
+        return "hipErrorTbd";
+    default:
+        throw std::runtime_error("unknown hipError");
+    }
+}
+#endif
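A small sketch of how a test might use this header (the wrapper function is illustrative, not part of the patch):

    #include <stdexcept>
    #include <string>
    #include "shared/enum_to_string.h"

    // Wrap a HIP call so failures surface with a readable error name.
    inline void throw_on_hip_error(hipError_t ret, const std::string& what)
    {
        if(ret != hipSuccess)
            throw std::runtime_error(what + ": " + hipError_to_string(ret));
    }

    // e.g. throw_on_hip_error(hipDeviceSynchronize(), "sync after FFT");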
diff --git a/shared/environment.h b/shared/environment.h
new file mode 100644
index 0000000..7be56a0
--- /dev/null
+++ b/shared/environment.h
@@ -0,0 +1,97 @@
+// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+// wrappers around environment variable routines
+
+#pragma once
+
+#include <string>
+
+// Windows provides "getenv" and "_putenv", but those modify the
+// runtime's copy of the environment.  The actual environment in the
+// process control block is accessed using GetEnvironmentVariable and
+// SetEnvironmentVariable.
+
+#ifdef WIN32
+#include <windows.h>
+static void rocfft_setenv(const char* var, const char* value)
+{
+    SetEnvironmentVariable(var, value);
+}
+static void rocfft_unsetenv(const char* var)
+{
+    SetEnvironmentVariable(var, nullptr);
+}
+static std::string rocfft_getenv(const char* var)
+{
+    DWORD size = GetEnvironmentVariable(var, nullptr, 0);
+    std::string ret;
+    if(size)
+    {
+        ret.resize(size);
+        GetEnvironmentVariable(var, ret.data(), size);
+        // GetEnvironmentVariable counts the terminating null, so remove it
+        while(!ret.empty() && ret.back() == 0)
+            ret.pop_back();
+    }
+    return ret;
+}
+
+#else
+
+#include <stdlib.h>
+
+static void rocfft_setenv(const char* var, const char* value)
+{
+    setenv(var, value, 1);
+}
+static void rocfft_unsetenv(const char* var)
+{
+    unsetenv(var);
+}
+static std::string rocfft_getenv(const char* var)
+{
+    auto value = getenv(var);
+    return value ? value : "";
+}
+#endif
+
+// RAII object to set an environment variable and restore it to its
+// previous value on destruction
+struct EnvironmentSetTemp
+{
+    EnvironmentSetTemp(const char* _var, const char* val)
+        : var(_var)
+    {
+        auto val_ptr = rocfft_getenv(_var);
+        if(!val_ptr.empty())
+            oldvalue = val_ptr;
+        rocfft_setenv(_var, val);
+    }
+    ~EnvironmentSetTemp()
+    {
+        if(oldvalue.empty())
+            rocfft_unsetenv(var.c_str());
+        else
+            rocfft_setenv(var.c_str(), oldvalue.c_str());
+    }
+    std::string var;
+    std::string oldvalue;
+};
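A usage sketch for the RAII guard above (the variable name is made up for illustration): the override lasts only for the enclosing scope, which lets a test pin behavior without leaking environment state into later tests.

    #include "shared/environment.h"

    void run_with_temporary_env()
    {
        {
            // Temporarily set a variable for the duration of this scope.
            EnvironmentSetTemp guard("ROCFFT_EXAMPLE_VAR", "1");
            // ... rocfft_getenv("ROCFFT_EXAMPLE_VAR") now returns "1".
        } // destructor restores the old value, or unsets it if there was none
    }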
diff --git a/shared/fft_params.h b/shared/fft_params.h
new file mode 100644
index 0000000..bf428ef
--- /dev/null
+++ b/shared/fft_params.h
@@ -0,0 +1,3274 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef FFT_PARAMS_H
+#define FFT_PARAMS_H
+
+#include <algorithm>
+#include <hip/hip_runtime.h>
+#include <iostream>
+#include <mutex>
+#include <numeric>
+#include <sstream>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <random>
+#include <tuple>
+#include <unordered_set>
+#include <vector>
+
+#include "../shared/arithmetic.h"
+#include "../shared/array_validator.h"
+#include "../shared/data_gen_device.h"
+#include "../shared/data_gen_host.h"
+#include "../shared/device_properties.h"
+#include "../shared/printbuffer.h"
+#include "../shared/ptrdiff.h"
+
+enum fft_status
+{
+    fft_status_success,
+    fft_status_failure,
+    fft_status_invalid_arg_value,
+    fft_status_invalid_dimensions,
+    fft_status_invalid_array_type,
+    fft_status_invalid_strides,
+    fft_status_invalid_distance,
+    fft_status_invalid_offset,
+    fft_status_invalid_work_buffer,
+};
+
+enum fft_transform_type
+{
+    fft_transform_type_complex_forward,
+    fft_transform_type_complex_inverse,
+    fft_transform_type_real_forward,
+    fft_transform_type_real_inverse,
+};
+
+enum fft_precision
+{
+    fft_precision_half,
+    fft_precision_single,
+    fft_precision_double,
+};
+
+static std::istream& operator>>(std::istream& str, fft_precision& precision)
+{
+    std::string word;
+    str >> word;
+
+    if(word == "half")
+        precision = fft_precision_half;
+    else if(word == "single")
+        precision = fft_precision_single;
+    else if(word == "double")
+        precision = fft_precision_double;
+    else
+        throw std::runtime_error("Invalid precision specified");
+    return str;
+}
+
+// fft_input_generator: linearly spaced sequence in [-0.5,0.5]
+// fft_input_random_generator: pseudo-random sequence in [-0.5,0.5]
+enum fft_input_generator
+{
+    fft_input_random_generator_device,
+    fft_input_random_generator_host,
+    fft_input_generator_device,
+    fft_input_generator_host,
+};
+
+static std::istream& operator>>(std::istream& str, fft_input_generator& gen)
+{
+    std::string word;
+    str >> word;
+
+    if(word == "0")
+        gen = fft_input_random_generator_device;
+    else if(word == "1")
+        gen = fft_input_random_generator_host;
+    else if(word == "2")
+        gen = fft_input_generator_device;
+    else if(word == "3")
+        gen = fft_input_generator_host;
+    else
+        throw std::runtime_error("Invalid input generator specified");
+    return str;
+}
+
+enum fft_array_type
+{
+    fft_array_type_complex_interleaved,
+    fft_array_type_complex_planar,
+    fft_array_type_real,
+    fft_array_type_hermitian_interleaved,
+    fft_array_type_hermitian_planar,
+    fft_array_type_unset,
+};
+
+enum fft_result_placement
+{
+    fft_placement_inplace,
+    fft_placement_notinplace,
+};
+
+// Determine the size of the data type given the precision and type.
+template <typename Tsize>
+inline Tsize var_size(const fft_precision precision, const fft_array_type type)
+{
+    size_t var_size = 0;
+    switch(precision)
+    {
+    case fft_precision_half:
+        var_size = sizeof(_Float16);
+        break;
+    case fft_precision_single:
+        var_size = sizeof(float);
+        break;
+    case fft_precision_double:
+        var_size = sizeof(double);
+        break;
+    }
+    switch(type)
+    {
+    case fft_array_type_complex_interleaved:
+    case fft_array_type_hermitian_interleaved:
+        var_size *= 2;
+        break;
+    default:
+        break;
+    }
+    return var_size;
+}
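// Worked examples for var_size above (hand-traced from the switches; an
// illustrative note, not part of the patch): interleaved complex types double
// the per-component size, while planar and real types report only the
// component size, since planar data splits real/imaginary parts across two
// buffers.
//   var_size<size_t>(fft_precision_single, fft_array_type_complex_interleaved) == 8
//   var_size<size_t>(fft_precision_double, fft_array_type_real)                == 8
//   var_size<size_t>(fft_precision_half,   fft_array_type_complex_planar)      == 2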
+// Given an array type and transform length, strides, etc, load random floats in [0,1]
+// into the input array of floats/doubles or complex floats/doubles gpu buffers.
+template <typename Tfloat, typename Tint1>
+inline void set_input(std::vector<gpubuf>& input,
+                      const fft_input_generator igen,
+                      const fft_array_type itype,
+                      const std::vector<size_t>& length,
+                      const std::vector<size_t>& ilength,
+                      const std::vector<size_t>& istride,
+                      const Tint1& whole_length,
+                      const Tint1& whole_stride,
+                      const size_t idist,
+                      const size_t nbatch,
+                      const hipDeviceProp_t& deviceProp)
+{
+    auto isize = count_iters(whole_length) * nbatch;
+
+    switch(itype)
+    {
+    case fft_array_type_complex_interleaved:
+    case fft_array_type_hermitian_interleaved:
+    {
+        auto ibuffer = (rocfft_complex<Tfloat>*)input[0].data();
+
+        if(igen == fft_input_generator_device)
+            generate_interleaved_data(
+                whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp);
+        else if(igen == fft_input_random_generator_device)
+            generate_random_interleaved_data(
+                whole_length, idist, isize, whole_stride, ibuffer, deviceProp);
+
+        if(itype == fft_array_type_hermitian_interleaved)
+        {
+            auto ibuffer_2 = (rocfft_complex<Tfloat>*)input[0].data();
+            impose_hermitian_symmetry_interleaved(
+                length, ilength, istride, idist, nbatch, ibuffer_2, deviceProp);
+        }
+
+        break;
+    }
+    case fft_array_type_complex_planar:
+    case fft_array_type_hermitian_planar:
+    {
+        auto ibuffer_real = (Tfloat*)input[0].data();
+        auto ibuffer_imag = (Tfloat*)input[1].data();
+
+        if(igen == fft_input_generator_device)
+            generate_planar_data(whole_length,
+                                 idist,
+                                 isize,
+                                 whole_stride,
+                                 nbatch,
+                                 ibuffer_real,
+                                 ibuffer_imag,
+                                 deviceProp);
+        else if(igen == fft_input_random_generator_device)
+            generate_random_planar_data(
+                whole_length, idist, isize, whole_stride, ibuffer_real, ibuffer_imag, deviceProp);
+
+        if(itype == fft_array_type_hermitian_planar)
+            impose_hermitian_symmetry_planar(
+                length, ilength, istride, idist, nbatch, ibuffer_real, ibuffer_imag, deviceProp);
+
+        break;
+    }
+    case fft_array_type_real:
+    {
+        auto ibuffer = (Tfloat*)input[0].data();
+
+        if(igen == fft_input_generator_device)
+            generate_real_data(
+                whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp);
+        else if(igen == fft_input_random_generator_device)
+            generate_random_real_data(
+                whole_length, idist, isize, whole_stride, ibuffer, deviceProp);
+
+        break;
+    }
+    default:
+        throw std::runtime_error("Input layout format not yet supported");
+    }
+}
+
+template <typename Tfloat, typename Tint1>
+inline void set_input(std::vector<hostbuf>& input,
+                      const fft_input_generator igen,
+                      const fft_array_type itype,
+                      const std::vector<size_t>& length,
+                      const std::vector<size_t>& ilength,
+                      const std::vector<size_t>& istride,
+                      const Tint1& whole_length,
+                      const Tint1& whole_stride,
+                      const size_t idist,
+                      const size_t nbatch,
+                      const hipDeviceProp_t& deviceProp)
+{
+    switch(itype)
+    {
+    case fft_array_type_complex_interleaved:
+    case fft_array_type_hermitian_interleaved:
+    {
+        if(igen == fft_input_generator_host)
+            generate_interleaved_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+        else if(igen == fft_input_random_generator_host)
+            generate_random_interleaved_data<Tfloat>(
+                input, whole_length, whole_stride, idist, nbatch);
+
+        if(itype == fft_array_type_hermitian_interleaved)
+            impose_hermitian_symmetry_interleaved<Tfloat>(input, length, istride, idist, nbatch);
+
+        break;
+    }
+    case fft_array_type_complex_planar:
+    case fft_array_type_hermitian_planar:
+    {
+        if(igen == fft_input_generator_host)
+            generate_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+        else if(igen == fft_input_random_generator_host)
+            generate_random_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+
+        if(itype == fft_array_type_hermitian_planar)
+            impose_hermitian_symmetry_planar<Tfloat>(input, length, istride, idist, nbatch);
+
+        break;
+    }
+    case fft_array_type_real:
+    {
+        if(igen == fft_input_generator_host)
+            generate_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+        else if(igen == fft_input_random_generator_host)
+            generate_random_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
+
+        break;
+    }
+    default:
+        throw std::runtime_error("Input layout format not yet supported");
+    }
+}
+
+// unroll set_input for dimension 1, 2, 3
+template <typename Tbuff, typename Tfloat>
+inline void set_input(std::vector<Tbuff>& input,
+                      const fft_input_generator igen,
+                      const fft_array_type itype,
+                      const std::vector<size_t>& length,
+                      const std::vector<size_t>& ilength,
+                      const std::vector<size_t>& istride,
+                      const size_t idist,
+                      const size_t nbatch,
+                      const hipDeviceProp_t& deviceProp)
+{
+    switch(length.size())
+    {
+    case 1:
+        set_input<Tfloat>(input,
+                          igen,
+                          itype,
+                          length,
+                          ilength,
+                          istride,
+                          ilength[0],
+                          istride[0],
+                          idist,
+                          nbatch,
+                          deviceProp);
+        break;
+    case 2:
+        set_input<Tfloat>(input,
+                          igen,
+                          itype,
+                          length,
+                          ilength,
+                          istride,
+                          std::make_tuple(ilength[0], ilength[1]),
+                          std::make_tuple(istride[0], istride[1]),
+                          idist,
+                          nbatch,
+                          deviceProp);
+        break;
+    case 3:
+        set_input<Tfloat>(input,
+                          igen,
+                          itype,
+                          length,
+                          ilength,
+                          istride,
+                          std::make_tuple(ilength[0], ilength[1], ilength[2]),
+                          std::make_tuple(istride[0], istride[1], istride[2]),
+                          idist,
+                          nbatch,
+                          deviceProp);
+        break;
+    default:
+        abort();
+    }
+}
+
+// Container class for test parameters.
+class fft_params
+{
+public:
+    // All parameters are row-major.
+    std::vector<size_t> length;
+    std::vector<size_t> istride;
+    std::vector<size_t> ostride;
+    size_t nbatch = 1;
+    fft_precision precision = fft_precision_single;
+    fft_input_generator igen = fft_input_random_generator_device;
+    fft_transform_type transform_type = fft_transform_type_complex_forward;
+    fft_result_placement placement = fft_placement_inplace;
+    size_t idist = 0;
+    size_t odist = 0;
+    fft_array_type itype = fft_array_type_unset;
+    fft_array_type otype = fft_array_type_unset;
+    std::vector<size_t> ioffset = {0, 0};
+    std::vector<size_t> ooffset = {0, 0};
+
+    std::vector<size_t> isize;
+    std::vector<size_t> osize;
+
+    size_t workbuffersize = 0;
+
+    struct fft_brick
+    {
+        // all vectors here are row-major, with same length as FFT
+        // dimension + 1 (for batch dimension)
+
+        // inclusive lower bound of brick
+        std::vector<size_t> lower;
+        // exclusive upper bound of brick
+        std::vector<size_t> upper;
+        // stride of brick in memory
+        std::vector<size_t> stride;
+
+        // compute the length of this brick
+        std::vector<size_t> length() const
+        {
+            std::vector<size_t> ret;
+            for(size_t i = 0; i < lower.size(); ++i)
+                ret.push_back(upper[i] - lower[i]);
+            return ret;
+        }
+
+        // compute offset of lower bound in a field with the given
+        // stride + dist (batch stride is separate)
+        size_t lower_field_offset(std::vector<size_t> stride, size_t dist) const
+        {
+            // brick strides include batch, so adjust our input accordingly
+            stride.insert(stride.begin(), dist);
+
+            return std::inner_product(lower.begin(), lower.end(), stride.begin(), 0);
+        }
+
+        // location of the brick
+        int device = 0;
+    };
+
+    struct fft_field
+    {
+        std::vector<fft_brick> bricks;
+    };
+    // optional brick decomposition of inputs/outputs
+    std::vector<fft_field> ifields;
+    std::vector<fft_field> ofields;
+
+    // run testing load/store callbacks
+    bool run_callbacks = false;
+    static constexpr double load_cb_scalar  = 0.457813941;
+    static constexpr double store_cb_scalar = 0.391504938;
+
+    // Check that data outside of output strides is not overwritten.
+    // This is only set explicitly on some tests where there's space
+    // between dimensions, but the dimensions are still in-order.
+    // We're not trying to generically find holes in arbitrary data
+    // layouts.
+    //
+    // NOTE: this flag is not included in tokens, since it doesn't
+    // affect how the FFT library behaves.
+    bool check_output_strides = false;
+
+    // scaling factor - we do a pointwise multiplication of outputs by
+    // this factor
+    double scale_factor = 1.0;
+
+    fft_params(){};
+    virtual ~fft_params(){};
+
+    // Given an array type, return the name as a string.
+    static std::string array_type_name(const fft_array_type type, bool verbose = true)
+    {
+        switch(type)
+        {
+        case fft_array_type_complex_interleaved:
+            return verbose ? "fft_array_type_complex_interleaved" : "CI";
+        case fft_array_type_complex_planar:
+            return verbose ? "fft_array_type_complex_planar" : "CP";
+        case fft_array_type_real:
+            return verbose ? "fft_array_type_real" : "R";
+        case fft_array_type_hermitian_interleaved:
+            return verbose ? "fft_array_type_hermitian_interleaved" : "HI";
+        case fft_array_type_hermitian_planar:
+            return verbose ? "fft_array_type_hermitian_planar" : "HP";
+        case fft_array_type_unset:
+            return verbose ? "fft_array_type_unset" : "UN";
+        }
+        return "";
+    }
+
+    std::string transform_type_name() const
+    {
+        switch(transform_type)
+        {
+        case fft_transform_type_complex_forward:
+            return "fft_transform_type_complex_forward";
+        case fft_transform_type_complex_inverse:
+            return "fft_transform_type_complex_inverse";
+        case fft_transform_type_real_forward:
+            return "fft_transform_type_real_forward";
+        case fft_transform_type_real_inverse:
+            return "fft_transform_type_real_inverse";
+        default:
+            throw std::runtime_error("Invalid transform type");
+        }
+    }
+
+    // Convert to string for output.
+    std::string str(const std::string& separator = ", ") const
+    {
+        // top-level stride/dist are not used when fields are specified.
+        const bool have_ifields = !ifields.empty();
+        const bool have_ofields = !ofields.empty();
+
+        std::stringstream ss;
+        auto print_size_vec = [&](const char* description, const std::vector<size_t>& vec) {
+            ss << description << ":";
+            for(auto i : vec)
+                ss << " " << i;
+            ss << separator;
+        };
+        auto print_fields = [&](const char* description, const std::vector<fft_field>& fields) {
+            for(unsigned int fidx = 0; fidx < fields.size(); ++fidx)
+            {
+                const auto& f = fields[fidx];
+                ss << description << " " << fidx << ":" << separator;
+                for(unsigned int bidx = 0; bidx < f.bricks.size(); ++bidx)
+                {
+                    const auto& b = f.bricks[bidx];
+                    ss << " brick " << bidx << ":" << separator;
+                    print_size_vec(" lower", b.lower);
+                    print_size_vec(" upper", b.upper);
+                    print_size_vec(" stride", b.stride);
+                    ss << " device: " << b.device << separator;
+                }
+            }
+        };
+
+        print_size_vec("length", length);
+        if(have_ifields)
+        {
+            print_fields("ifield", ifields);
+        }
+        else
+        {
+            print_size_vec("istride", istride);
+            ss << "idist: " << idist << separator;
+        }
+
+        if(have_ofields)
+        {
+            print_fields("ofield", ofields);
+        }
+        else
+        {
+            print_size_vec("ostride", ostride);
+            ss << "odist: " << odist << separator;
+        }
+
+        ss << "batch: " << nbatch << separator;
+        print_size_vec("isize", isize);
+        print_size_vec("osize", osize);
+
+        print_size_vec("ioffset", ioffset);
+        print_size_vec("ooffset", ooffset);
+
+        if(placement == fft_placement_inplace)
+            ss << "in-place";
+        else
+            ss << "out-of-place";
+        ss << separator;
+        ss << "transform_type: " << transform_type_name() << separator;
+        ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator;
+        switch(precision)
+        {
+        case fft_precision_half:
+            ss << "half-precision";
+            break;
+        case fft_precision_single:
+            ss << "single-precision";
+            break;
+        case fft_precision_double:
+            ss << "double-precision";
+            break;
+        }
+        ss << separator;
+
+        print_size_vec("ilength", ilength());
+        print_size_vec("olength", olength());
+
+        print_size_vec("ibuffer_size", ibuffer_sizes());
+        print_size_vec("obuffer_size", obuffer_sizes());
+
+        if(scale_factor != 1.0)
+            ss << "scale factor: " << scale_factor << separator;
+
+        return ss.str();
+    }
+
+    // Produce a stringified token of the test fft params.
+    std::string token() const
+    {
+        std::string ret;
+
+        switch(transform_type)
+        {
+        case fft_transform_type_complex_forward:
+            ret += "complex_forward_";
+            break;
+        case fft_transform_type_complex_inverse:
+            ret += "complex_inverse_";
+            break;
+        case fft_transform_type_real_forward:
+            ret += "real_forward_";
+            break;
+        case fft_transform_type_real_inverse:
+            ret += "real_inverse_";
+            break;
+        }
+
+        auto append_size_vec = [&ret](const std::vector<size_t>& vec) {
+            for(auto s : vec)
+            {
+                ret += "_";
+                ret += std::to_string(s);
+            }
+        };
+
+        ret += "len";
+        append_size_vec(length);
+
+        switch(precision)
+        {
+        case fft_precision_half:
+            ret += "_half_";
+            break;
+        case fft_precision_single:
+            ret += "_single_";
+            break;
+        case fft_precision_double:
+            ret += "_double_";
+            break;
+        }
+
+        switch(placement)
+        {
+        case fft_placement_inplace:
+            ret += "ip_";
+            break;
+        case fft_placement_notinplace:
+            ret += "op_";
+            break;
+        }
+
+        ret += "batch_";
+        ret += std::to_string(nbatch);
+
+        auto append_array_type = [&ret](fft_array_type type) {
+            switch(type)
+            {
+            case fft_array_type_complex_interleaved:
+                ret += "CI";
+                break;
+            case fft_array_type_complex_planar:
+                ret += "CP";
+                break;
+            case fft_array_type_real:
+                ret += "R";
+                break;
+            case fft_array_type_hermitian_interleaved:
+                ret += "HI";
+                break;
+            case fft_array_type_hermitian_planar:
+                ret += "HP";
+                break;
+            default:
+                ret += "UN";
+                break;
+            }
+        };
+
+        auto append_brick_info = [&ret, &append_size_vec](const fft_brick& b) {
+            ret += "_brick";
+
+            ret += "_lower";
+            append_size_vec(b.lower);
+            ret += "_upper";
+            append_size_vec(b.upper);
+            ret += "_stride";
+            append_size_vec(b.stride);
+            ret += "_dev_";
+            ret += std::to_string(b.device);
+        };
+
+        const bool have_ifields = !ifields.empty();
+        const bool have_ofields = !ofields.empty();
+
+        if(have_ifields)
+        {
+            for(const auto& f : ifields)
+            {
+                ret += "_ifield";
+                for(const auto& b : f.bricks)
+                    append_brick_info(b);
+            }
+        }
+        else
+        {
+            ret += "_istride";
+            append_size_vec(istride);
+            ret += "_";
+            append_array_type(itype);
+        }
+
+        if(have_ofields)
+        {
+            for(const auto& f : ofields)
+            {
+                ret += "_ofield";
+                for(const auto& b : f.bricks)
+                    append_brick_info(b);
+            }
+        }
+        else
+        {
+            ret += "_ostride";
+            append_size_vec(ostride);
+            ret += "_";
+            append_array_type(otype);
+        }
+
+        if(!have_ifields)
+        {
+            ret += "_idist_";
+            ret += std::to_string(idist);
+        }
+        if(!have_ofields)
+        {
+            ret += "_odist_";
+            ret += std::to_string(odist);
+        }
+
+        if(!have_ifields)
+        {
+            ret += "_ioffset";
+            append_size_vec(ioffset);
+        }
+
+        if(!have_ofields)
+        {
+            ret += "_ooffset";
+            append_size_vec(ooffset);
+        }
+
+        if(run_callbacks)
+            ret += "_CB";
+
+        if(scale_factor != 1.0)
+            ret += "_scale";
+
+        return ret;
+    }
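// A hand-derived illustration of the token format assembled above (assuming
// validate() has filled in contiguous strides and distances for a
// single-precision, in-place, forward complex FFT of length 8, batch 1; an
// editorial note, not part of the patch):
//
//   complex_forward_len_8_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_8_odist_8_ioffset_0_0_ooffset_0_0
//
// from_token() below parses exactly this layout back into an fft_params.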
+
+    // Set all params from a stringified token.
+    void from_token(std::string token)
+    {
+        std::vector<std::string> vals;
+
+        std::string delimiter = "_";
+        {
+            size_t pos = 0;
+            while((pos = token.find(delimiter)) != std::string::npos)
+            {
+                auto val = token.substr(0, pos);
+                vals.push_back(val);
+                token.erase(0, pos + delimiter.length());
+            }
+            vals.push_back(token);
+        }
+
+        auto size_parser
+            = [](const std::vector<std::string>& vals, const std::string token, size_t& pos) {
+                  if(vals[pos++] != token)
+                      throw std::runtime_error("Unable to parse token");
+                  return std::stoull(vals[pos++]);
+              };
+
+        auto vector_parser
+            = [](const std::vector<std::string>& vals, const std::string token, size_t& pos) {
+                  if(vals[pos++] != token)
+                      throw std::runtime_error("Unable to parse token");
+                  std::vector<size_t> vec;
+
+                  while(pos < vals.size())
+                  {
+                      if(std::all_of(vals[pos].begin(), vals[pos].end(), ::isdigit))
+                      {
+                          vec.push_back(std::stoull(vals[pos++]));
+                      }
+                      else
+                      {
+                          break;
+                      }
+                  }
+                  return vec;
+              };
+
+        auto type_parser = [](const std::string& val) {
+            if(val == "CI")
+                return fft_array_type_complex_interleaved;
+            else if(val == "CP")
+                return fft_array_type_complex_planar;
+            else if(val == "R")
+                return fft_array_type_real;
+            else if(val == "HI")
+                return fft_array_type_hermitian_interleaved;
+            else if(val == "HP")
+                return fft_array_type_hermitian_planar;
+            return fft_array_type_unset;
+        };
+
+        auto field_parser = [&vector_parser, &size_parser](const std::vector<std::string>& vals,
+                                                           size_t& pos,
+                                                           std::vector<fft_field>& output) {
+            // skip over ifield/ofield word
+            pos++;
+            fft_field& f = output.emplace_back();
+            while(pos < vals.size() && vals[pos] == "brick")
+            {
+                fft_brick& b = f.bricks.emplace_back();
+                pos++;
+                b.lower  = vector_parser(vals, "lower", pos);
+                b.upper  = vector_parser(vals, "upper", pos);
+                b.stride = vector_parser(vals, "stride", pos);
+                b.device = size_parser(vals, "dev", pos);
+            }
+        };
+
+        size_t pos = 0;
+
+        bool complex = vals[pos++] == "complex";
+        bool forward = vals[pos++] == "forward";
+
+        if(complex && forward)
+            transform_type = fft_transform_type_complex_forward;
+        if(complex && !forward)
+            transform_type = fft_transform_type_complex_inverse;
+        if(!complex && forward)
+            transform_type = fft_transform_type_real_forward;
+        if(!complex && !forward)
+            transform_type = fft_transform_type_real_inverse;
+
+        length = vector_parser(vals, "len", pos);
+
+        if(vals[pos] == "half")
+            precision = fft_precision_half;
+        else if(vals[pos] == "single")
+            precision = fft_precision_single;
+        else if(vals[pos] == "double")
+            precision = fft_precision_double;
+        pos++;
+
+        placement = (vals[pos++] == "ip") ? fft_placement_inplace : fft_placement_notinplace;
+
+        nbatch = size_parser(vals, "batch", pos);
+
+        // strides, bricks etc are mixed in from here, so just keep
+        // looking at the next token to decide what to do
+        while(pos < vals.size())
+        {
+            const auto& next_token = vals[pos];
+            if(next_token == "istride")
+            {
+                istride = vector_parser(vals, "istride", pos);
+                itype   = type_parser(vals[pos]);
+                pos++;
+            }
+            else if(next_token == "ostride")
+            {
+                ostride = vector_parser(vals, "ostride", pos);
+                otype   = type_parser(vals[pos]);
+                pos++;
+            }
+            else if(next_token == "idist")
+                idist = size_parser(vals, "idist", pos);
+            else if(next_token == "odist")
+                odist = size_parser(vals, "odist", pos);
+            else if(next_token == "ioffset")
+                ioffset = vector_parser(vals, "ioffset", pos);
+            else if(next_token == "ooffset")
+                ooffset = vector_parser(vals, "ooffset", pos);
+            else if(next_token == "ifield")
+                field_parser(vals, pos, ifields);
+            else if(next_token == "ofield")
+                field_parser(vals, pos, ofields);
+            else
+                break;
+        }
+
+        if(pos < vals.size() && vals[pos] == "CB")
+        {
+            run_callbacks = true;
+            ++pos;
+        }
+
+        if(pos < vals.size() && vals[pos] == "scale")
+        {
+            // just pick some factor that's not zero or one
+            scale_factor = 0.1239;
+            ++pos;
+        }
+    }
+
+    // Stream output operator (for gtest, etc).
+    friend std::ostream& operator<<(std::ostream& stream, const fft_params& params)
+    {
+        stream << params.str();
+        return stream;
+    }
+
+    // Dimension of the transform.
+    size_t dim() const
+    {
+        return length.size();
+    }
+
+    virtual std::vector<size_t> ilength() const
+    {
+        auto ilength = length;
+        if(transform_type == fft_transform_type_real_inverse)
+            ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1;
+        return ilength;
+    }
+
+    virtual std::vector<size_t> olength() const
+    {
+        auto olength = length;
+        if(transform_type == fft_transform_type_real_forward)
+            olength[dim() - 1] = olength[dim() - 1] / 2 + 1;
+        return olength;
+    }
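// A quick worked example of the two length getters (hand-traced; an
// illustrative note, not part of the patch): for a real-forward transform
// with length = {8, 10}, olength() is {8, 6}, since the Hermitian output
// stores only 10/2 + 1 = 6 complex elements in the contiguous dimension;
// symmetrically, a real-inverse transform of the same length has
// ilength() = {8, 6}.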
+
+    static size_t nbuffer(const fft_array_type type)
+    {
+        switch(type)
+        {
+        case fft_array_type_real:
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_hermitian_interleaved:
+            return 1;
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+            return 2;
+        case fft_array_type_unset:
+            return 0;
+        }
+        return 0;
+    }
+
+    // Number of input buffers
+    size_t nibuffer() const
+    {
+        return nbuffer(itype);
+    }
+
+    // Number of output buffers
+    size_t nobuffer() const
+    {
+        return nbuffer(otype);
+    }
+
+    void set_iotypes()
+    {
+        if(itype == fft_array_type_unset)
+        {
+            switch(transform_type)
+            {
+            case fft_transform_type_complex_forward:
+            case fft_transform_type_complex_inverse:
+                itype = fft_array_type_complex_interleaved;
+                break;
+            case fft_transform_type_real_forward:
+                itype = fft_array_type_real;
+                break;
+            case fft_transform_type_real_inverse:
+                itype = fft_array_type_hermitian_interleaved;
+                break;
+            default:
+                throw std::runtime_error("Invalid transform type");
+            }
+        }
+        if(otype == fft_array_type_unset)
+        {
+            switch(transform_type)
+            {
+            case fft_transform_type_complex_forward:
+            case fft_transform_type_complex_inverse:
+                otype = fft_array_type_complex_interleaved;
+                break;
+            case fft_transform_type_real_forward:
+                otype = fft_array_type_hermitian_interleaved;
+                break;
+            case fft_transform_type_real_inverse:
+                otype = fft_array_type_real;
+                break;
+            default:
+                throw std::runtime_error("Invalid transform type");
+            }
+        }
+    }
+
+    // Check that the input and output types are consistent.
+    bool check_iotypes() const
+    {
+        switch(itype)
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_interleaved:
+        case fft_array_type_hermitian_planar:
+        case fft_array_type_real:
+            break;
+        default:
+            throw std::runtime_error("Invalid Input array type format");
+        }
+
+        switch(otype)
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_interleaved:
+        case fft_array_type_hermitian_planar:
+        case fft_array_type_real:
+            break;
+        default:
+            throw std::runtime_error("Invalid Output array type format");
+        }
+
+        // Check that format choices are supported
+        if(transform_type != fft_transform_type_real_forward
+           && transform_type != fft_transform_type_real_inverse)
+        {
+            if(placement == fft_placement_inplace && itype != otype)
+            {
+                throw std::runtime_error(
+                    "In-place transforms must have identical input and output types");
+            }
+        }
+
+        bool okformat = true;
+        switch(itype)
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_complex_planar:
+            okformat = (otype == fft_array_type_complex_interleaved
+                        || otype == fft_array_type_complex_planar);
+            break;
+        case fft_array_type_hermitian_interleaved:
+        case fft_array_type_hermitian_planar:
+            okformat = otype == fft_array_type_real;
+            break;
+        case fft_array_type_real:
+            okformat = (otype == fft_array_type_hermitian_interleaved
+                        || otype == fft_array_type_hermitian_planar);
+            break;
+        default:
+            throw std::runtime_error("Invalid Input array type format");
+        }
+
+        return okformat;
+    }
+
+    // Given a length vector, set the rest of the strides.
+    // The optional argument stride0 sets the stride for the contiguous dimension.
+    // The optional rcpadding argument sets the stride correctly for in-place
+    // multi-dimensional real/complex transforms.
+    // Format is row-major.
+    template <typename T1>
+    std::vector<T1> compute_stride(const std::vector<T1>& length,
+                                   const std::vector<size_t>& stride0 = std::vector<size_t>(),
+                                   const bool rcpadding = false) const
+    {
+        std::vector<T1> stride(dim());
+
+        size_t dimoffset = 0;
+
+        if(stride0.size() == 0)
+        {
+            // Set the contiguous stride:
+            stride[dim() - 1] = 1;
+            dimoffset         = 1;
+        }
+        else
+        {
+            // Copy the input values to the end of the stride array:
+            for(size_t i = 0; i < stride0.size(); ++i)
+            {
+                stride[dim() - stride0.size() + i] = stride0[i];
+            }
+        }
+
+        if(stride0.size() < dim())
+        {
+            // Compute any remaining values via recursion.
+            for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;)
+            {
+                auto lengthip1 = length[i + 1];
+                if(rcpadding && i == dim() - 2)
+                {
+                    lengthip1 = 2 * (lengthip1 / 2 + 1);
+                }
+                stride[i] = stride[i + 1] * lengthip1;
+            }
+        }
+
+        return stride;
+    }
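// A worked example of compute_stride above (hand-traced, hypothetical sizes;
// not part of the patch): with length = {4, 8, 16} and no stride0, the
// contiguous stride is 1 and the recursion gives stride = {128, 16, 1}.
// With rcpadding = true (in-place real-forward), the fastest dimension pads
// to 2 * (16/2 + 1) = 18, giving stride = {144, 18, 1}.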
+
+    void compute_istride()
+    {
+        istride = compute_stride(ilength(),
+                                 istride,
+                                 placement == fft_placement_inplace
+                                     && transform_type == fft_transform_type_real_forward);
+    }
+
+    void compute_ostride()
+    {
+        ostride = compute_stride(olength(),
+                                 ostride,
+                                 placement == fft_placement_inplace
+                                     && transform_type == fft_transform_type_real_inverse);
+    }
+
+    virtual void compute_isize()
+    {
+        auto il    = ilength();
+        size_t val = compute_ptrdiff(il, istride, nbatch, idist);
+        isize.resize(nibuffer());
+        for(unsigned int i = 0; i < isize.size(); ++i)
+        {
+            isize[i] = val + ioffset[i];
+        }
+    }
+
+    virtual void compute_osize()
+    {
+        auto ol    = olength();
+        size_t val = compute_ptrdiff(ol, ostride, nbatch, odist);
+        osize.resize(nobuffer());
+        for(unsigned int i = 0; i < osize.size(); ++i)
+        {
+            osize[i] = val + ooffset[i];
+        }
+    }
+
+    std::vector<size_t> ibuffer_sizes() const
+    {
+        std::vector<size_t> ibuffer_sizes;
+
+        // In-place real-to-complex transforms need to have enough space in the input buffer to
+        // accommodate the output, which is slightly larger.
+        if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward)
+        {
+            return obuffer_sizes();
+        }
+
+        if(isize.empty())
+            return ibuffer_sizes;
+
+        switch(itype)
+        {
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+            ibuffer_sizes.resize(2);
+            break;
+        default:
+            ibuffer_sizes.resize(1);
+        }
+        for(unsigned i = 0; i < ibuffer_sizes.size(); i++)
+        {
+            ibuffer_sizes[i] = isize[i] * var_size<size_t>(precision, itype);
+        }
+        return ibuffer_sizes;
+    }
+
+    virtual std::vector<size_t> obuffer_sizes() const
+    {
+        std::vector<size_t> obuffer_sizes;
+
+        if(osize.empty())
+            return obuffer_sizes;
+
+        switch(otype)
+        {
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+            obuffer_sizes.resize(2);
+            break;
+        default:
+            obuffer_sizes.resize(1);
+        }
+        for(unsigned i = 0; i < obuffer_sizes.size(); i++)
+        {
+            obuffer_sizes[i] = osize[i] * var_size<size_t>(precision, otype);
+        }
+        return obuffer_sizes;
+    }
+
+    // Compute the idist for a given transform based on the placeness, transform type, and data
+    // layout.
+    size_t compute_idist() const
+    {
+        size_t dist = 0;
+        // In-place 1D transforms need extra dist.
+        if(transform_type == fft_transform_type_real_forward && dim() == 1
+           && placement == fft_placement_inplace)
+        {
+            dist = 2 * (length[0] / 2 + 1) * istride[0];
+            return dist;
+        }
+
+        if(transform_type == fft_transform_type_real_inverse && dim() == 1)
+        {
+            dist = (length[0] / 2 + 1) * istride[0];
+            return dist;
+        }
+
+        dist = (transform_type == fft_transform_type_real_inverse)
+                   ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1]
+                   : length[dim() - 1] * istride[dim() - 1];
+        for(unsigned int i = 0; i < dim() - 1; ++i)
+        {
+            dist = std::max(length[i] * istride[i], dist);
+        }
+        return dist;
+    }
+    void set_idist()
+    {
+        if(idist != 0)
+            return;
+        idist = compute_idist();
+    }
+
+    // Compute the odist for a given transform based on the placeness, transform type, and data
+    // layout.  Row-major.
+    size_t compute_odist() const
+    {
+        size_t dist = 0;
+        // In-place 1D transforms need extra dist.
+        if(transform_type == fft_transform_type_real_inverse && dim() == 1
+           && placement == fft_placement_inplace)
+        {
+            dist = 2 * (length[0] / 2 + 1) * ostride[0];
+            return dist;
+        }
+
+        if(transform_type == fft_transform_type_real_forward && dim() == 1)
+        {
+            dist = (length[0] / 2 + 1) * ostride[0];
+            return dist;
+        }
+
+        dist = (transform_type == fft_transform_type_real_forward)
+                   ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1]
+                   : length[dim() - 1] * ostride[dim() - 1];
+        for(unsigned int i = 0; i < dim() - 1; ++i)
+        {
+            dist = std::max(length[i] * ostride[i], dist);
+        }
+        return dist;
+    }
+    void set_odist()
+    {
+        if(odist != 0)
+            return;
+        odist = compute_odist();
+    }
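// Hand-traced idist/odist examples for the helpers above (hypothetical sizes;
// not part of the patch): a 1D in-place real-forward transform of length 8
// with unit strides gets idist = 2 * (8/2 + 1) = 10 real elements and
// odist = 8/2 + 1 = 5 complex elements, while a 1D complex transform of
// length 8 gets idist = odist = 8.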
+
+    // Put the length, stride, batch, and dist into a single length/stride array and pass off to the
+    // validity checker.
+    bool valid_length_stride_batch_dist(const std::vector<size_t>& l0,
+                                        const std::vector<size_t>& s0,
+                                        const size_t n,
+                                        const size_t dist,
+                                        const int verbose = 0) const
+    {
+        if(l0.size() != s0.size())
+            return false;
+
+        // Length and stride vectors, including batches:
+        std::vector<size_t> l{}, s{};
+        for(unsigned int i = 0; i < l0.size(); ++i)
+        {
+            if(l0[i] > 1)
+            {
+                if(s0[i] == 0)
+                    return false;
+                l.push_back(l0[i]);
+                s.push_back(s0[i]);
+            }
+        }
+        if(n > 1)
+        {
+            if(dist == 0)
+                return false;
+            l.push_back(n);
+            s.push_back(dist);
+        }
+
+        return array_valid(l, s, verbose);
+    }
+
+    // Return true if the given GPU parameters would produce a valid transform.
+    bool valid(const int verbose) const
+    {
+        if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer())
+            return false;
+
+        // Check that in-place transforms have the same input and output stride:
+        if(placement == fft_placement_inplace)
+        {
+            const auto stridesize = std::min(istride.size(), ostride.size());
+            bool samestride       = true;
+            for(unsigned int i = 0; i < stridesize; ++i)
+            {
+                if(istride[i] != ostride[i])
+                    samestride = false;
+            }
+            if((transform_type == fft_transform_type_complex_forward
+                || transform_type == fft_transform_type_complex_inverse)
+               && !samestride)
+            {
+                // In-place transforms require identical input and output strides.
+                if(verbose)
+                {
+                    std::cout << "istride:";
+                    for(const auto& i : istride)
+                        std::cout << " " << i;
+                    std::cout << " ostride:";
+                    for(const auto& i : ostride)
+                        std::cout << " " << i;
+                    std::cout << " differ for in-place transforms: skipping test"
+                              << std::endl;
+                }
+                return false;
+            }
+
+            if((transform_type == fft_transform_type_complex_forward
+                || transform_type == fft_transform_type_complex_inverse)
+               && (idist != odist) && nbatch > 1)
+            {
+                // In-place transforms require identical distance, if
+                // batch > 1.  If batch is 1 then dist is ignored and
+                // the FFT should still work.
+                if(verbose)
+                {
+                    std::cout << "idist:" << idist << " odist:" << odist
+                              << " differ for in-place transforms: skipping test"
+                              << std::endl;
+                }
+                return false;
+            }
+
+            if((transform_type == fft_transform_type_real_forward
+                || transform_type == fft_transform_type_real_inverse)
+               && (istride.back() != 1 || ostride.back() != 1))
+            {
+                // In-place real/complex transforms require unit strides.
+                if(verbose)
+                {
+                    std::cout
+                        << "istride.back(): " << istride.back()
+                        << " ostride.back(): " << ostride.back()
+                        << " must be unitary for in-place real/complex transforms: skipping test"
+                        << std::endl;
+                }
+                return false;
+            }
+
+            if((itype == fft_array_type_complex_interleaved
+                && otype == fft_array_type_complex_planar)
+               || (itype == fft_array_type_complex_planar
+                   && otype == fft_array_type_complex_interleaved))
+            {
+                if(verbose)
+                {
+                    std::cout << "In-place c2c transforms require identical io types; skipped.\n";
+                }
+                return false;
+            }
+
+            // Check offsets
+            switch(transform_type)
+            {
+            case fft_transform_type_complex_forward:
+            case fft_transform_type_complex_inverse:
+                for(unsigned int i = 0; i < nibuffer(); ++i)
+                {
+                    if(ioffset[i] != ooffset[i])
+                        return false;
+                }
+                break;
+            case fft_transform_type_real_forward:
+                if(ioffset[0] != 2 * ooffset[0])
+                    return false;
+                break;
+            case fft_transform_type_real_inverse:
+                if(2 * ioffset[0] != ooffset[0])
+                    return false;
+                break;
+            }
+        }
+
+        if(!check_iotypes())
+            return false;
+
+        // we can only check output strides on out-of-place
+        // transforms, since we need to initialize output to a known
+        // pattern
+        if(placement == fft_placement_inplace && check_output_strides)
+            return false;
+
+        // Check input and output strides
+        if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true)
+        {
+            if(verbose)
+                std::cout << "Invalid input data format.\n";
+            return false;
+        }
+        if(!(ilength() == olength() && istride == ostride && idist == odist))
+        {
+            // Only check if different
+            if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true)
+            {
+                if(verbose)
+                    std::cout << "Invalid output data format.\n";
+                return false;
+            }
+        }
+
+        // The parameters are valid.
+        return true;
+    }
+
|
|
+ // Fill in any missing parameters.
|
|
+ void validate()
|
|
+ {
|
|
+ set_iotypes();
|
|
+ compute_istride();
|
|
+ compute_ostride();
|
|
+ set_idist();
|
|
+ set_odist();
|
|
+ compute_isize();
|
|
+ compute_osize();
|
|
+
|
|
+ validate_fields();
|
|
+ }
|
|
+
|
|
+ virtual void validate_fields() const
|
|
+ {
|
|
+ if(!ifields.empty() || !ofields.empty())
|
|
+ throw std::runtime_error("input/output fields are unsupported");
|
|
+ }
|
|
+
|
|
+ // Column-major getters:
|
|
+ std::vector<size_t> length_cm() const
|
|
+ {
|
|
+ auto length_cm = length;
|
|
+ std::reverse(std::begin(length_cm), std::end(length_cm));
|
|
+ return length_cm;
|
|
+ }
|
|
+ std::vector<size_t> ilength_cm() const
|
|
+ {
|
|
+ auto ilength_cm = ilength();
|
|
+ std::reverse(std::begin(ilength_cm), std::end(ilength_cm));
|
|
+ return ilength_cm;
|
|
+ }
|
|
+ std::vector<size_t> olength_cm() const
|
|
+ {
|
|
+ auto olength_cm = olength();
|
|
+ std::reverse(std::begin(olength_cm), std::end(olength_cm));
|
|
+ return olength_cm;
|
|
+ }
|
|
+ std::vector<size_t> istride_cm() const
|
|
+ {
|
|
+ auto istride_cm = istride;
|
|
+ std::reverse(std::begin(istride_cm), std::end(istride_cm));
|
|
+ return istride_cm;
|
|
+ }
|
|
+ std::vector<size_t> ostride_cm() const
|
|
+ {
|
|
+ auto ostride_cm = ostride;
|
|
+ std::reverse(std::begin(ostride_cm), std::end(ostride_cm));
|
|
+ return ostride_cm;
|
|
+ }
|
|
+ bool is_planar() const
|
|
+ {
|
|
+ if(itype == fft_array_type_complex_planar || itype == fft_array_type_hermitian_planar)
|
|
+ return true;
|
|
+ if(otype == fft_array_type_complex_planar || otype == fft_array_type_hermitian_planar)
|
|
+ return true;
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ // Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if necessary.
|
|
+ template <typename Tbuff>
|
|
+ inline void compute_input(std::vector<Tbuff>& input)
|
|
+ {
|
|
+ auto deviceProp = get_curr_device_prop();
|
|
+
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ set_input<Tbuff, _Float16>(
|
|
+ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ set_input<Tbuff, double>(
|
|
+ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ set_input<Tbuff, float>(
|
|
+ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ template <typename Tstream = std::ostream>
|
|
+ void print_ibuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const
|
|
+ {
|
|
+ switch(itype)
|
|
+ {
|
|
+ case fft_array_type_complex_interleaved:
|
|
+ case fft_array_type_hermitian_interleaved:
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ {
|
|
+ buffer_printer<rocfft_complex<_Float16>> s;
|
|
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_single:
|
|
+ {
|
|
+ buffer_printer<rocfft_complex<float>> s;
|
|
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_double:
|
|
+ {
|
|
+ buffer_printer<rocfft_complex<double>> s;
|
|
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case fft_array_type_complex_planar:
|
|
+ case fft_array_type_hermitian_planar:
|
|
+ case fft_array_type_real:
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ {
|
|
+ buffer_printer<_Float16> s;
|
|
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_single:
|
|
+ {
|
|
+ buffer_printer<float> s;
|
|
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_double:
|
|
+ {
|
|
+ buffer_printer<double> s;
|
|
+ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ default:
|
|
+ throw std::runtime_error("Invalid itype in print_ibuffer");
|
|
+ }
|
|
+ }
|
|
+
|
|
+ template <typename Tstream = std::ostream>
|
|
+ void print_obuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const
|
|
+ {
|
|
+ switch(otype)
|
|
+ {
|
|
+ case fft_array_type_complex_interleaved:
|
|
+ case fft_array_type_hermitian_interleaved:
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ {
|
|
+ buffer_printer<rocfft_complex<_Float16>> s;
|
|
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_single:
|
|
+ {
|
|
+ buffer_printer<rocfft_complex<float>> s;
|
|
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_double:
+ {
+ buffer_printer<rocfft_complex<double>> s;
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+ break;
+ }
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case fft_array_type_complex_planar:
|
|
+ case fft_array_type_hermitian_planar:
|
|
+ case fft_array_type_real:
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ {
|
|
+ buffer_printer<_Float16> s;
|
|
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_single:
|
|
+ {
|
|
+ buffer_printer<float> s;
|
|
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_double:
|
|
+ {
|
|
+ buffer_printer<double> s;
|
|
+ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ default:
|
|
+ throw std::runtime_error("Invalid itype in print_obuffer");
|
|
+ }
|
|
+ }
|
|
+
|
|
+ void print_ibuffer_flat(const std::vector<hostbuf>& buf) const
|
|
+ {
|
|
+ switch(itype)
|
|
+ {
|
|
+ case fft_array_type_complex_interleaved:
|
|
+ case fft_array_type_hermitian_interleaved:
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ {
|
|
+ buffer_printer<rocfft_complex<_Float16>> s;
|
|
+ s.print_buffer_flat(buf, isize, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_single:
|
|
+ {
|
|
+ buffer_printer<rocfft_complex<float>> s;
|
|
+ s.print_buffer_flat(buf, isize, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_double:
+ {
+ buffer_printer<rocfft_complex<double>> s;
+ s.print_buffer_flat(buf, isize, ioffset);
+ break;
+ }
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case fft_array_type_complex_planar:
|
|
+ case fft_array_type_hermitian_planar:
|
|
+ case fft_array_type_real:
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ {
|
|
+ buffer_printer<_Float16> s;
|
|
+ s.print_buffer_flat(buf, isize, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_single:
|
|
+ {
|
|
+ buffer_printer<float> s;
|
|
+ s.print_buffer_flat(buf, isize, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_double:
|
|
+ {
|
|
+ buffer_printer<double> s;
|
|
+ s.print_buffer_flat(buf, isize, ioffset);
|
|
+ break;
|
|
+ }
|
|
+ }
+ break;
+ }
+ default:
+ throw std::runtime_error("Invalid itype in print_ibuffer_flat");
+ }
+ }
|
|
+
|
|
+ void print_obuffer_flat(const std::vector<hostbuf>& buf) const
|
|
+ {
|
|
+ switch(otype)
|
|
+ {
|
|
+ case fft_array_type_complex_interleaved:
|
|
+ case fft_array_type_hermitian_interleaved:
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ {
|
|
+ buffer_printer<rocfft_complex<_Float16>> s;
|
|
+ s.print_buffer_flat(buf, osize, ooffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_single:
|
|
+ {
|
|
+ buffer_printer<rocfft_complex<float>> s;
|
|
+ s.print_buffer_flat(buf, osize, ooffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_double:
+ {
+ buffer_printer<rocfft_complex<double>> s;
+ s.print_buffer_flat(buf, osize, ooffset);
+ break;
+ }
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case fft_array_type_complex_planar:
|
|
+ case fft_array_type_hermitian_planar:
|
|
+ case fft_array_type_real:
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ {
|
|
+ buffer_printer<_Float16> s;
|
|
+ s.print_buffer_flat(buf, osize, ooffset);
|
|
+ break;
|
|
+ }
|
|
+ case fft_precision_single:
|
|
+ {
|
|
+ buffer_printer<float> s;
|
|
+ s.print_buffer_flat(buf, osize, ooffset);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case fft_precision_double:
|
|
+ {
|
|
+ buffer_printer<double> s;
|
|
+ s.print_buffer_flat(buf, osize, ooffset);
|
|
+ break;
|
|
+ }
|
|
+ }
+ break;
+ }
+ default:
+ throw std::runtime_error("Invalid otype in print_obuffer_flat");
+ }
+ }
|
|
+
|
|
+ virtual fft_status set_callbacks(void* load_cb_host,
|
|
+ void* load_cb_data,
|
|
+ void* store_cb_host,
|
|
+ void* store_cb_data)
|
|
+ {
|
|
+ return fft_status_success;
|
|
+ }
|
|
+
|
|
+ virtual fft_status execute(void** in, void** out)
|
|
+ {
|
|
+ return fft_status_success;
|
|
+ }
|
|
+
|
|
+ size_t fft_params_vram_footprint()
|
|
+ {
|
|
+ return fft_params::vram_footprint();
|
|
+ }
|
|
+
|
|
+ virtual size_t vram_footprint()
|
|
+ {
|
|
+ const auto ibuf_size = ibuffer_sizes();
|
|
+ size_t val = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1);
|
|
+ if(placement == fft_placement_notinplace)
|
|
+ {
|
|
+ const auto obuf_size = obuffer_sizes();
|
|
+ val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1);
|
|
+ }
|
|
+ return val;
|
|
+ }
|
|
+
|
|
+ // Specific exception type for work buffer allocation failure.
|
|
+ // Tests that hit this can't fit on the GPU and should be skipped.
|
|
+ struct work_buffer_alloc_failure : public std::runtime_error
|
|
+ {
|
|
+ work_buffer_alloc_failure(const std::string& s)
|
|
+ : std::runtime_error(s)
|
|
+ {
|
|
+ }
|
|
+ };
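+
+ // Editor's note (illustrative, not from the upstream source): callers
+ // typically catch this exception and skip rather than fail, e.g.
+ //
+ // try { params.create_plan(); }
+ // catch(const fft_params::work_buffer_alloc_failure&) { /* skip test */ }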
|
|
+
|
|
+ virtual fft_status create_plan()
|
|
+ {
|
|
+ return fft_status_success;
|
|
+ }
|
|
+
|
|
+ // Change a forward transform to its inverse
|
|
+ void inverse_from_forward(fft_params& params_forward)
|
|
+ {
|
|
+ switch(params_forward.transform_type)
|
|
+ {
|
|
+ case fft_transform_type_complex_forward:
|
|
+ transform_type = fft_transform_type_complex_inverse;
|
|
+ break;
|
|
+ case fft_transform_type_real_forward:
|
|
+ transform_type = fft_transform_type_real_inverse;
|
|
+ break;
|
|
+ default:
|
|
+ throw std::runtime_error("Transform type not forward.");
|
|
+ }
|
|
+
|
|
+ length = params_forward.length;
|
|
+ istride = params_forward.ostride;
|
|
+ ostride = params_forward.istride;
|
|
+ nbatch = params_forward.nbatch;
|
|
+ precision = params_forward.precision;
|
|
+ placement = params_forward.placement;
|
|
+ idist = params_forward.odist;
|
|
+ odist = params_forward.idist;
|
|
+ itype = params_forward.otype;
|
|
+ otype = params_forward.itype;
|
|
+ ioffset = params_forward.ooffset;
|
|
+ ooffset = params_forward.ioffset;
|
|
+
|
|
+ run_callbacks = params_forward.run_callbacks;
|
|
+
|
|
+ check_output_strides = params_forward.check_output_strides;
|
|
+
|
|
+ scale_factor = 1 / params_forward.scale_factor;
|
|
+ }
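+
+ // Editor's usage sketch (not part of the upstream header):
+ //
+ // fft_params fwd; // ... filled in and validated as a forward transform
+ // fft_params inv;
+ // inv.inverse_from_forward(fwd); // swaps I/O layout, inverts scale_factor
+ // inv.validate();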
|
|
+
|
|
+ // prepare for multi-GPU transform. Generated input is in ibuffer.
|
|
+ // pibuffer, pobuffer are the pointers that will be passed to the
|
|
+ // FFT library's "execute" API.
|
|
+ virtual void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
|
|
+ std::vector<void*>& pibuffer,
|
|
+ std::vector<void*>& pobuffer)
|
|
+ {
|
|
+ }
|
|
+
|
|
+ // finalize multi-GPU transform. pobuffers are the pointers
|
|
+ // provided to the FFT library's "execute" API. obuffer is the
|
|
+ // buffer where transform output needs to go for validation
|
|
+ virtual void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) {}
|
|
+
|
|
+ // create bricks in the specified field for the specified number
|
|
+ // of devices. The field is split along the highest FFT
|
|
+ // dimension, and the length only includes FFT lengths, not batch
|
|
+ // dimension.
|
|
+ void distribute_field(int deviceCount,
|
|
+ std::vector<fft_field>& fields,
|
|
+ const std::vector<size_t>& field_length)
|
|
+ {
|
|
+ size_t slowLen = field_length.front();
|
|
+ if(slowLen < static_cast<size_t>(deviceCount))
|
|
+ throw std::runtime_error("too many devices to distribute length "
|
|
+ + std::to_string(slowLen));
|
|
+
|
|
+ auto& field = fields.emplace_back();
|
|
+
|
|
+ for(int i = 0; i < deviceCount; ++i)
|
|
+ {
|
|
+ // start at origin
|
|
+ std::vector<size_t> field_lower(field_length.size());
|
|
+ std::vector<size_t> field_upper(field_length.size());
|
|
+
|
|
+ // note: slowest FFT dim is index 0 in these coordinates
|
|
+ field_lower[0] = slowLen / deviceCount * i;
|
|
+
|
|
+ // last brick needs to include the whole slow len
|
|
+ if(i == deviceCount - 1)
|
|
+ {
|
|
+ field_upper[0] = slowLen;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ field_upper[0] = std::min(slowLen, field_lower[0] + slowLen / deviceCount);
|
|
+ }
|
|
+
|
|
+ for(unsigned int upperDim = 1; upperDim < field_length.size(); ++upperDim)
|
|
+ {
|
|
+ field_upper[upperDim] = field_length[upperDim];
|
|
+ }
|
|
+
|
|
+ // field coordinates also need to include batch
|
|
+ field_lower.insert(field_lower.begin(), 0);
|
|
+ field_upper.insert(field_upper.begin(), nbatch);
|
|
+
|
|
+ // bricks have contiguous strides
|
|
+ size_t brick_dist = 1;
|
|
+ std::vector<size_t> brick_stride(field_lower.size());
|
|
+ for(size_t distIdx = 0; distIdx < field_lower.size(); ++distIdx)
|
|
+ {
|
|
+ // fill strides from fastest to slowest
|
|
+ *(brick_stride.rbegin() + distIdx) = brick_dist;
|
|
+ brick_dist *= *(field_upper.rbegin() + distIdx) - *(field_lower.rbegin() + distIdx);
|
|
+ }
|
|
+ field.bricks.push_back(
|
|
+ fft_params::fft_brick{field_lower, field_upper, brick_stride, i});
|
|
+ }
|
|
+ }
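+
+ // Worked example (editor's note): with deviceCount = 2 and
+ // field_length = {8, 8}, the slowest dimension is split into bricks
+ // covering rows [0, 4) and [4, 8); each brick spans the full second
+ // dimension and the whole batch range [0, nbatch), with contiguous
+ // row-major strides computed from its own extents.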
|
|
+
|
|
+ void distribute_input(int deviceCount)
|
|
+ {
|
|
+ distribute_field(deviceCount, ifields, length);
|
|
+ }
|
|
+
|
|
+ void distribute_output(int deviceCount)
|
|
+ {
|
|
+ distribute_field(deviceCount, ofields, olength());
|
|
+ }
|
|
+};
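+
+// Illustrative end-to-end sketch (editor's example; assumes this header is
+// included and that the validity predicate shown above is exposed as
+// valid(verbose)):
+//
+// fft_params p;
+// p.length = {8, 8};
+// p.nbatch = 2;
+// p.precision = fft_precision_single;
+// p.transform_type = fft_transform_type_complex_forward;
+// p.validate(); // fills istride/ostride, idist/odist, isize/osize
+// bool ok = p.valid(true); // verbose = true explains any rejection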
|
|
+
|
|
+// This is used with the program_options class so that the user can type an integer on the
+// command line and we store it in an enum variable
|
|
+template <typename _Elem, typename _Traits>
|
|
+std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
|
|
+ fft_array_type& atype)
|
|
+{
|
|
+ unsigned tmp;
|
|
+ stream >> tmp;
|
|
+ atype = fft_array_type(tmp);
|
|
+ return stream;
|
|
+}
|
|
+
|
|
+// similarly for transform type
|
|
+template <typename _Elem, typename _Traits>
|
|
+std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
|
|
+ fft_transform_type& ttype)
|
|
+{
|
|
+ unsigned tmp;
|
|
+ stream >> tmp;
|
|
+ ttype = fft_transform_type(tmp);
|
|
+ return stream;
|
|
+}
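+
+// Editor's illustrative sketch (not part of the upstream header): the
+// extraction operators above let any istream-based option parser read an
+// integer token directly into the enum; std::istringstream stands in for
+// the program_options stream here.
+#include <sstream>
+#include <string>
+inline fft_transform_type parse_transform_type_example(const std::string& token)
+{
+ std::istringstream is(token);
+ fft_transform_type ttype = fft_transform_type_complex_forward;
+ is >> ttype; // reads an unsigned, then casts it to the enum
+ return ttype;
+}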
|
|
+
|
|
+// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
|
|
+template <typename T1>
|
|
+std::vector<std::pair<T1, T1>> partition_colmajor(const T1& length)
|
|
+{
|
|
+ return partition_base(length, compute_partition_count(length));
|
|
+}
|
|
+
|
|
+// Partition on the rightmost part of the tuple, for col-major indexing
|
|
+template <typename T1>
|
|
+std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
|
|
+ partition_colmajor(const std::tuple<T1, T1>& length)
|
|
+{
|
|
+ auto partitions = partition_base(std::get<1>(length), compute_partition_count(length));
|
|
+ std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
|
|
+ for(size_t i = 0; i < partitions.size(); ++i)
|
|
+ {
|
|
+ std::get<1>(ret[i].first) = partitions[i].first;
|
|
+ std::get<0>(ret[i].first) = 0;
|
|
+ std::get<1>(ret[i].second) = partitions[i].second;
|
|
+ std::get<0>(ret[i].second) = std::get<0>(length);
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+template <typename T1>
|
|
+std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
|
|
+ partition_colmajor(const std::tuple<T1, T1, T1>& length)
|
|
+{
|
|
+ auto partitions = partition_base(std::get<2>(length), compute_partition_count(length));
|
|
+ std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
|
|
+ for(size_t i = 0; i < partitions.size(); ++i)
|
|
+ {
|
|
+ std::get<2>(ret[i].first) = partitions[i].first;
|
|
+ std::get<1>(ret[i].first) = 0;
|
|
+ std::get<0>(ret[i].first) = 0;
|
|
+ std::get<2>(ret[i].second) = partitions[i].second;
|
|
+ std::get<1>(ret[i].second) = std::get<1>(length);
|
|
+ std::get<0>(ret[i].second) = std::get<0>(length);
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+// Copy data of dimensions length, with strides istride and distance idist between
+// batches, to a buffer with strides ostride and distance odist between batches.
+// The input and output types are identical.
|
|
+template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
|
|
+inline void copy_buffers_1to1(const Tval* input,
|
|
+ Tval* output,
|
|
+ const Tint1& whole_length,
|
|
+ const size_t nbatch,
|
|
+ const Tint2& istride,
|
|
+ const size_t idist,
|
|
+ const Tint3& ostride,
|
|
+ const size_t odist,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset)
|
|
+{
|
|
+ const bool idx_equals_odx = istride == ostride && idist == odist;
|
|
+ size_t idx_base = 0;
|
|
+ size_t odx_base = 0;
|
|
+ auto partitions = partition_rowmajor(whole_length);
|
|
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
|
|
+ {
|
|
+#ifdef _OPENMP
|
|
+#pragma omp parallel for num_threads(partitions.size())
|
|
+#endif
|
|
+ for(size_t part = 0; part < partitions.size(); ++part)
|
|
+ {
|
|
+ auto index = partitions[part].first;
|
|
+ const auto length = partitions[part].second;
|
|
+ do
|
|
+ {
|
|
+ const auto idx = compute_index(index, istride, idx_base);
|
|
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
|
|
+ output[odx + ooffset[0]] = input[idx + ioffset[0]];
|
|
+ } while(increment_rowmajor(index, length));
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+// Copy data of dimensions length, with strides istride and distance idist between
+// batches, to a buffer with strides ostride and distance odist between batches.
+// The input type is planar and the output type is complex interleaved.
|
|
+template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
|
|
+inline void copy_buffers_2to1(const Tval* input0,
|
|
+ const Tval* input1,
|
|
+ rocfft_complex<Tval>* output,
|
|
+ const Tint1& whole_length,
|
|
+ const size_t nbatch,
|
|
+ const Tint2& istride,
|
|
+ const size_t idist,
|
|
+ const Tint3& ostride,
|
|
+ const size_t odist,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset)
|
|
+{
|
|
+ const bool idx_equals_odx = istride == ostride && idist == odist;
|
|
+ size_t idx_base = 0;
|
|
+ size_t odx_base = 0;
|
|
+ auto partitions = partition_rowmajor(whole_length);
|
|
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
|
|
+ {
|
|
+#ifdef _OPENMP
|
|
+#pragma omp parallel for num_threads(partitions.size())
|
|
+#endif
|
|
+ for(size_t part = 0; part < partitions.size(); ++part)
|
|
+ {
|
|
+ auto index = partitions[part].first;
|
|
+ const auto length = partitions[part].second;
|
|
+ do
|
|
+ {
|
|
+ const auto idx = compute_index(index, istride, idx_base);
|
|
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
|
|
+ output[odx + ooffset[0]]
|
|
+ = rocfft_complex<Tval>(input0[idx + ioffset[0]], input1[idx + ioffset[1]]);
|
|
+ } while(increment_rowmajor(index, length));
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+// Copy data of dimensions length, with strides istride and distance idist between
+// batches, to a buffer with strides ostride and distance odist between batches.
+// The input type is complex interleaved and the output type is planar.
|
|
+template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
|
|
+inline void copy_buffers_1to2(const rocfft_complex<Tval>* input,
|
|
+ Tval* output0,
|
|
+ Tval* output1,
|
|
+ const Tint1& whole_length,
|
|
+ const size_t nbatch,
|
|
+ const Tint2& istride,
|
|
+ const size_t idist,
|
|
+ const Tint3& ostride,
|
|
+ const size_t odist,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset)
|
|
+{
|
|
+ const bool idx_equals_odx = istride == ostride && idist == odist;
|
|
+ size_t idx_base = 0;
|
|
+ size_t odx_base = 0;
|
|
+ auto partitions = partition_rowmajor(whole_length);
|
|
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
|
|
+ {
|
|
+#ifdef _OPENMP
|
|
+#pragma omp parallel for num_threads(partitions.size())
|
|
+#endif
|
|
+ for(size_t part = 0; part < partitions.size(); ++part)
|
|
+ {
|
|
+ auto index = partitions[part].first;
|
|
+ const auto length = partitions[part].second;
|
|
+ do
|
|
+ {
|
|
+ const auto idx = compute_index(index, istride, idx_base);
|
|
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
|
|
+ output0[odx + ooffset[0]] = input[idx + ioffset[0]].real();
|
|
+ output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag();
|
|
+ } while(increment_rowmajor(index, length));
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+// Copy data of dimensions length, with strides istride and distance idist between
+// batches, to a buffer with strides ostride and distance odist between batches.
+// The input type is given by itype, and the output type is given by otype.
|
|
+template <typename Tint1, typename Tint2, typename Tint3>
|
|
+inline void copy_buffers(const std::vector<hostbuf>& input,
|
|
+ std::vector<hostbuf>& output,
|
|
+ const Tint1& length,
|
|
+ const size_t nbatch,
|
|
+ const fft_precision precision,
|
|
+ const fft_array_type itype,
|
|
+ const Tint2& istride,
|
|
+ const size_t idist,
|
|
+ const fft_array_type otype,
|
|
+ const Tint3& ostride,
|
|
+ const size_t odist,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset)
|
|
+{
|
|
+ if(itype == otype)
|
|
+ {
|
|
+ switch(itype)
|
|
+ {
|
|
+ case fft_array_type_complex_interleaved:
|
|
+ case fft_array_type_hermitian_interleaved:
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ copy_buffers_1to1(
|
|
+ reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
|
|
+ reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ copy_buffers_1to1(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
|
|
+ reinterpret_cast<rocfft_complex<float>*>(output[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ copy_buffers_1to1(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
|
|
+ reinterpret_cast<rocfft_complex<double>*>(output[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ }
|
|
+ break;
|
|
+ case fft_array_type_real:
|
|
+ case fft_array_type_complex_planar:
|
|
+ case fft_array_type_hermitian_planar:
|
|
+ for(unsigned int idx = 0; idx < input.size(); ++idx)
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ copy_buffers_1to1(reinterpret_cast<const _Float16*>(input[idx].data()),
|
|
+ reinterpret_cast<_Float16*>(output[idx].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ copy_buffers_1to1(reinterpret_cast<const float*>(input[idx].data()),
|
|
+ reinterpret_cast<float*>(output[idx].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ copy_buffers_1to1(reinterpret_cast<const double*>(input[idx].data()),
|
|
+ reinterpret_cast<double*>(output[idx].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+ default:
|
|
+ throw std::runtime_error("Invalid data type");
|
|
+ }
|
|
+ }
|
|
+ else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
|
|
+ || (itype == fft_array_type_hermitian_interleaved
|
|
+ && otype == fft_array_type_hermitian_planar))
|
|
+ {
|
|
+ // copy 1 to 2
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ copy_buffers_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
|
|
+ reinterpret_cast<_Float16*>(output[0].data()),
|
|
+ reinterpret_cast<_Float16*>(output[1].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ copy_buffers_1to2(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
|
|
+ reinterpret_cast<float*>(output[0].data()),
|
|
+ reinterpret_cast<float*>(output[1].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ copy_buffers_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
|
|
+ reinterpret_cast<double*>(output[0].data()),
|
|
+ reinterpret_cast<double*>(output[1].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
|
|
+ || (itype == fft_array_type_hermitian_planar
|
|
+ && otype == fft_array_type_hermitian_interleaved))
|
|
+ {
|
|
+ // copy 2 to 1
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ copy_buffers_2to1(reinterpret_cast<const _Float16*>(input[0].data()),
|
|
+ reinterpret_cast<const _Float16*>(input[1].data()),
|
|
+ reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ copy_buffers_2to1(reinterpret_cast<const float*>(input[0].data()),
|
|
+ reinterpret_cast<const float*>(input[1].data()),
|
|
+ reinterpret_cast<rocfft_complex<float>*>(output[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ copy_buffers_2to1(reinterpret_cast<const double*>(input[0].data()),
|
|
+ reinterpret_cast<const double*>(input[1].data()),
|
|
+ reinterpret_cast<rocfft_complex<double>*>(output[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ throw std::runtime_error("Invalid input and output types.");
|
|
+ }
|
|
+}
|
|
+
|
|
+// unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions
|
|
+template <typename Tint1, typename Tint2, typename Tint3>
|
|
+inline void copy_buffers(const std::vector<hostbuf>& input,
|
|
+ std::vector<hostbuf>& output,
|
|
+ const std::vector<Tint1>& length,
|
|
+ const size_t nbatch,
|
|
+ const fft_precision precision,
|
|
+ const fft_array_type itype,
|
|
+ const std::vector<Tint2>& istride,
|
|
+ const size_t idist,
|
|
+ const fft_array_type otype,
|
|
+ const std::vector<Tint3>& ostride,
|
|
+ const size_t odist,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset)
|
|
+{
|
|
+ switch(length.size())
|
|
+ {
|
|
+ case 1:
|
|
+ return copy_buffers(input,
|
|
+ output,
|
|
+ length[0],
|
|
+ nbatch,
|
|
+ precision,
|
|
+ itype,
|
|
+ istride[0],
|
|
+ idist,
|
|
+ otype,
|
|
+ ostride[0],
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ case 2:
|
|
+ return copy_buffers(input,
|
|
+ output,
|
|
+ std::make_tuple(length[0], length[1]),
|
|
+ nbatch,
|
|
+ precision,
|
|
+ itype,
|
|
+ std::make_tuple(istride[0], istride[1]),
|
|
+ idist,
|
|
+ otype,
|
|
+ std::make_tuple(ostride[0], ostride[1]),
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ case 3:
|
|
+ return copy_buffers(input,
|
|
+ output,
|
|
+ std::make_tuple(length[0], length[1], length[2]),
|
|
+ nbatch,
|
|
+ precision,
|
|
+ itype,
|
|
+ std::make_tuple(istride[0], istride[1], istride[2]),
|
|
+ idist,
|
|
+ otype,
|
|
+ std::make_tuple(ostride[0], ostride[1], ostride[2]),
|
|
+ odist,
|
|
+ ioffset,
|
|
+ ooffset);
|
|
+ default:
|
|
+ abort();
|
|
+ }
|
|
+}
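+
+// Editor's usage sketch for the dispatcher above (assumes the hostbuf
+// vectors are already allocated to the right sizes): copy a contiguous
+// length-16, single-precision, complex-interleaved buffer into a
+// complex-planar pair.
+inline void copy_interleaved_to_planar_example(const std::vector<hostbuf>& in,
+ std::vector<hostbuf>& out)
+{
+ const std::vector<size_t> length{16};
+ const std::vector<size_t> stride{1};
+ copy_buffers(in,
+ out,
+ length,
+ 1, // nbatch
+ fft_precision_single,
+ fft_array_type_complex_interleaved,
+ stride,
+ 16, // idist
+ fft_array_type_complex_planar,
+ stride,
+ 16, // odist
+ {0}, // ioffset: one entry for the interleaved input
+ {0, 0}); // ooffset: one entry per planar output buffer
+}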
|
|
+
|
|
+// Compute the L-infinity and L-2 distance between a buffer with strides istride
+// and distance idist between batches, and a buffer with strides ostride and
+// distance odist between batches. Both buffers are of complex type.
|
|
+
|
|
+struct VectorNorms
|
|
+{
|
|
+ double l_2 = 0.0, l_inf = 0.0;
|
|
+};
|
|
+
|
|
+template <typename Tcomplex, typename Tint1, typename Tint2, typename Tint3>
|
|
+inline VectorNorms distance_1to1_complex(const Tcomplex* input,
|
|
+ const Tcomplex* output,
|
|
+ const Tint1& whole_length,
|
|
+ const size_t nbatch,
|
|
+ const Tint2& istride,
|
|
+ const size_t idist,
|
|
+ const Tint3& ostride,
|
|
+ const size_t odist,
|
|
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
|
|
+ const double linf_cutoff,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset,
|
|
+ const double output_scalar = 1.0)
|
|
+{
|
|
+ double linf = 0.0;
|
|
+ double l2 = 0.0;
|
|
+
|
|
+ std::mutex linf_failure_lock;
|
|
+ std::vector<std::pair<size_t, size_t>> linf_failures_private;
|
|
+
|
|
+ const bool idx_equals_odx = istride == ostride && idist == odist;
|
|
+ size_t idx_base = 0;
|
|
+ size_t odx_base = 0;
|
|
+ auto partitions = partition_colmajor(whole_length);
|
|
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
|
|
+ {
|
|
+#ifdef _OPENMP
|
|
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
|
|
+#endif
|
|
+ for(size_t part = 0; part < partitions.size(); ++part)
|
|
+ {
|
|
+ double cur_linf = 0.0;
|
|
+ double cur_l2 = 0.0;
|
|
+ auto index = partitions[part].first;
|
|
+ const auto length = partitions[part].second;
|
|
+
|
|
+ do
|
|
+ {
|
|
+ const auto idx = compute_index(index, istride, idx_base);
|
|
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
|
|
+ const double rdiff
|
|
+ = std::abs(static_cast<double>(output[odx + ooffset[0]].real()) * output_scalar
|
|
+ - static_cast<double>(input[idx + ioffset[0]].real()));
|
|
+ cur_linf = std::max(rdiff, cur_linf);
|
|
+ if(cur_linf > linf_cutoff)
|
|
+ {
|
|
+ std::pair<size_t, size_t> fval(b, idx);
|
|
+ if(linf_failures)
|
|
+ linf_failures_private.push_back(fval);
|
|
+ }
|
|
+ cur_l2 += rdiff * rdiff;
|
|
+
|
|
+ const double idiff
|
|
+ = std::abs(static_cast<double>(output[odx + ooffset[0]].imag()) * output_scalar
|
|
+ - static_cast<double>(input[idx + ioffset[0]].imag()));
|
|
+ cur_linf = std::max(idiff, cur_linf);
|
|
+ if(cur_linf > linf_cutoff)
|
|
+ {
|
|
+ std::pair<size_t, size_t> fval(b, idx);
|
|
+ if(linf_failures)
|
|
+ linf_failures_private.push_back(fval);
|
|
+ }
|
|
+ cur_l2 += idiff * idiff;
|
|
+
|
|
+ } while(increment_rowmajor(index, length));
|
|
+ linf = std::max(linf, cur_linf);
|
|
+ l2 += cur_l2;
|
|
+
|
|
+ if(linf_failures)
|
|
+ {
|
|
+ linf_failure_lock.lock();
|
|
+ std::copy(linf_failures_private.begin(),
|
|
+ linf_failures_private.end(),
|
|
+ std::back_inserter(*linf_failures));
|
|
+ linf_failure_lock.unlock();
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return {.l_2 = sqrt(l2), .l_inf = linf};
|
|
+}
|
|
+
|
|
+// Compute the L-infinity and L-2 distance between a buffer with strides istride
+// and distance idist between batches, and a buffer with strides ostride and
+// distance odist between batches. Both buffers are of real type.
|
|
+template <typename Tfloat, typename Tint1, typename Tint2, typename Tint3>
|
|
+inline VectorNorms distance_1to1_real(const Tfloat* input,
|
|
+ const Tfloat* output,
|
|
+ const Tint1& whole_length,
|
|
+ const size_t nbatch,
|
|
+ const Tint2& istride,
|
|
+ const size_t idist,
|
|
+ const Tint3& ostride,
|
|
+ const size_t odist,
|
|
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
|
|
+ const double linf_cutoff,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset,
|
|
+ const double output_scalar = 1.0)
|
|
+{
|
|
+ double linf = 0.0;
|
|
+ double l2 = 0.0;
|
|
+
|
|
+ std::mutex linf_failure_lock;
|
|
+ std::vector<std::pair<size_t, size_t>> linf_failures_private;
|
|
+
|
|
+ const bool idx_equals_odx = istride == ostride && idist == odist;
|
|
+ size_t idx_base = 0;
|
|
+ size_t odx_base = 0;
|
|
+ auto partitions = partition_rowmajor(whole_length);
|
|
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
|
|
+ {
|
|
+#ifdef _OPENMP
|
|
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
|
|
+#endif
|
|
+ for(size_t part = 0; part < partitions.size(); ++part)
|
|
+ {
|
|
+ double cur_linf = 0.0;
|
|
+ double cur_l2 = 0.0;
|
|
+ auto index = partitions[part].first;
|
|
+ const auto length = partitions[part].second;
|
|
+ do
|
|
+ {
|
|
+ const auto idx = compute_index(index, istride, idx_base);
|
|
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
|
|
+ const double diff
|
|
+ = std::abs(static_cast<double>(output[odx + ooffset[0]]) * output_scalar
|
|
+ - static_cast<double>(input[idx + ioffset[0]]));
|
|
+ cur_linf = std::max(diff, cur_linf);
|
|
+ if(cur_linf > linf_cutoff)
|
|
+ {
|
|
+ std::pair<size_t, size_t> fval(b, idx);
|
|
+ if(linf_failures)
|
|
+ linf_failures_private.push_back(fval);
|
|
+ }
|
|
+ cur_l2 += diff * diff;
|
|
+
|
|
+ } while(increment_rowmajor(index, length));
|
|
+ linf = std::max(linf, cur_linf);
|
|
+ l2 += cur_l2;
|
|
+
|
|
+ if(linf_failures)
|
|
+ {
|
|
+ linf_failure_lock.lock();
|
|
+ std::copy(linf_failures_private.begin(),
|
|
+ linf_failures_private.end(),
|
|
+ std::back_inserter(*linf_failures));
|
|
+ linf_failure_lock.unlock();
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return {.l_2 = sqrt(l2), .l_inf = linf};
|
|
+}
|
|
+
|
|
+// Compute the L-infinity and L-2 distance between a buffer with strides istride
+// and distance idist between batches, and a buffer with strides ostride and
+// distance odist between batches. The input is complex-interleaved and the
+// output is complex-planar.
|
|
+template <typename Tval, typename Tint1, typename T2, typename T3>
|
|
+inline VectorNorms distance_1to2(const rocfft_complex<Tval>* input,
|
|
+ const Tval* output0,
|
|
+ const Tval* output1,
|
|
+ const Tint1& whole_length,
|
|
+ const size_t nbatch,
|
|
+ const T2& istride,
|
|
+ const size_t idist,
|
|
+ const T3& ostride,
|
|
+ const size_t odist,
|
|
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
|
|
+ const double linf_cutoff,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset,
|
|
+ const double output_scalar = 1.0)
|
|
+{
|
|
+ double linf = 0.0;
|
|
+ double l2 = 0.0;
|
|
+
|
|
+ std::mutex linf_failure_lock;
|
|
+ std::vector<std::pair<size_t, size_t>> linf_failures_private;
|
|
+
|
|
+ const bool idx_equals_odx = istride == ostride && idist == odist;
|
|
+ size_t idx_base = 0;
|
|
+ size_t odx_base = 0;
|
|
+ auto partitions = partition_rowmajor(whole_length);
|
|
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
|
|
+ {
|
|
+#ifdef _OPENMP
|
|
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
|
|
+#endif
|
|
+ for(size_t part = 0; part < partitions.size(); ++part)
|
|
+ {
|
|
+ double cur_linf = 0.0;
|
|
+ double cur_l2 = 0.0;
|
|
+ auto index = partitions[part].first;
|
|
+ const auto length = partitions[part].second;
|
|
+ do
|
|
+ {
|
|
+ const auto idx = compute_index(index, istride, idx_base);
|
|
+ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
|
|
+ const double rdiff
|
|
+ = std::abs(static_cast<double>(output0[odx + ooffset[0]]) * output_scalar
|
|
+ - static_cast<double>(input[idx + ioffset[0]].real()));
|
|
+ cur_linf = std::max(rdiff, cur_linf);
|
|
+ if(cur_linf > linf_cutoff)
|
|
+ {
|
|
+ std::pair<size_t, size_t> fval(b, idx);
|
|
+ if(linf_failures)
|
|
+ linf_failures_private.push_back(fval);
|
|
+ }
|
|
+ cur_l2 += rdiff * rdiff;
|
|
+
|
|
+ const double idiff
|
|
+ = std::abs(static_cast<double>(output1[odx + ooffset[1]]) * output_scalar
|
|
+ - static_cast<double>(input[idx + ioffset[0]].imag()));
|
|
+ cur_linf = std::max(idiff, cur_linf);
|
|
+ if(cur_linf > linf_cutoff)
|
|
+ {
|
|
+ std::pair<size_t, size_t> fval(b, idx);
|
|
+ if(linf_failures)
|
|
+ linf_failures_private.push_back(fval);
|
|
+ }
|
|
+ cur_l2 += idiff * idiff;
|
|
+
|
|
+ } while(increment_rowmajor(index, length));
|
|
+ linf = std::max(linf, cur_linf);
|
|
+ l2 += cur_l2;
|
|
+
|
|
+ if(linf_failures)
|
|
+ {
|
|
+ linf_failure_lock.lock();
|
|
+ std::copy(linf_failures_private.begin(),
|
|
+ linf_failures_private.end(),
|
|
+ std::back_inserter(*linf_failures));
|
|
+ linf_failure_lock.unlock();
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return {.l_2 = sqrt(l2), .l_inf = linf};
|
|
+}
|
|
+
|
|
+// Compute the L-infinity and L-2 distance between two buffers of dimension length and
|
|
+// with types given by itype, otype, and precision.
|
|
+template <typename Tint1, typename Tint2, typename Tint3>
|
|
+inline VectorNorms distance(const std::vector<hostbuf>& input,
|
|
+ const std::vector<hostbuf>& output,
|
|
+ const Tint1& length,
|
|
+ const size_t nbatch,
|
|
+ const fft_precision precision,
|
|
+ const fft_array_type itype,
|
|
+ const Tint2& istride,
|
|
+ const size_t idist,
|
|
+ const fft_array_type otype,
|
|
+ const Tint3& ostride,
|
|
+ const size_t odist,
|
|
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
|
|
+ const double linf_cutoff,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset,
|
|
+ const double output_scalar = 1.0)
|
|
+{
|
|
+ VectorNorms dist;
|
|
+
|
|
+ if(itype == otype)
|
|
+ {
|
|
+ switch(itype)
|
|
+ {
|
|
+ case fft_array_type_complex_interleaved:
|
|
+ case fft_array_type_hermitian_interleaved:
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ dist = distance_1to1_complex(
|
|
+ reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
|
|
+ reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ dist = distance_1to1_complex(
|
|
+ reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
|
|
+ reinterpret_cast<const rocfft_complex<float>*>(output[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ dist = distance_1to1_complex(
|
|
+ reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
|
|
+ reinterpret_cast<const rocfft_complex<double>*>(output[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ }
|
|
+ dist.l_2 *= dist.l_2;
|
|
+ break;
|
|
+ case fft_array_type_real:
|
|
+ case fft_array_type_complex_planar:
|
|
+ case fft_array_type_hermitian_planar:
|
|
+ for(unsigned int idx = 0; idx < input.size(); ++idx)
|
|
+ {
|
|
+ VectorNorms d;
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ d = distance_1to1_real(reinterpret_cast<const _Float16*>(input[idx].data()),
|
|
+ reinterpret_cast<const _Float16*>(output[idx].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ d = distance_1to1_real(reinterpret_cast<const float*>(input[idx].data()),
|
|
+ reinterpret_cast<const float*>(output[idx].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ d = distance_1to1_real(reinterpret_cast<const double*>(input[idx].data()),
|
|
+ reinterpret_cast<const double*>(output[idx].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ }
|
|
+ dist.l_inf = std::max(d.l_inf, dist.l_inf);
|
|
+ dist.l_2 += d.l_2 * d.l_2;
|
|
+ }
|
|
+ break;
|
|
+ default:
|
|
+ throw std::runtime_error("Invalid input and output types.");
|
|
+ }
|
|
+ }
|
|
+ else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
|
|
+ || (itype == fft_array_type_hermitian_interleaved
|
|
+ && otype == fft_array_type_hermitian_planar))
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
|
|
+ reinterpret_cast<const _Float16*>(output[0].data()),
|
|
+ reinterpret_cast<const _Float16*>(output[1].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
|
|
+ reinterpret_cast<const float*>(output[0].data()),
|
|
+ reinterpret_cast<const float*>(output[1].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
|
|
+ reinterpret_cast<const double*>(output[0].data()),
|
|
+ reinterpret_cast<const double*>(output[1].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ ostride,
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ }
|
|
+ dist.l_2 *= dist.l_2;
|
|
+ }
|
|
+ else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
|
|
+ || (itype == fft_array_type_hermitian_planar
|
|
+ && otype == fft_array_type_hermitian_interleaved))
|
|
+ {
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ dist
|
|
+ = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()),
|
|
+ reinterpret_cast<const _Float16*>(input[0].data()),
|
|
+ reinterpret_cast<const _Float16*>(input[1].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ ostride,
|
|
+ odist,
|
|
+ istride,
|
|
+ idist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<float>*>(output[0].data()),
|
|
+ reinterpret_cast<const float*>(input[0].data()),
|
|
+ reinterpret_cast<const float*>(input[1].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ ostride,
|
|
+ odist,
|
|
+ istride,
|
|
+ idist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(output[0].data()),
|
|
+ reinterpret_cast<const double*>(input[0].data()),
|
|
+ reinterpret_cast<const double*>(input[1].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ ostride,
|
|
+ odist,
|
|
+ istride,
|
|
+ idist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ break;
|
|
+ }
|
|
+ dist.l_2 *= dist.l_2;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ throw std::runtime_error("Invalid input and output types.");
|
|
+ }
|
|
+ dist.l_2 = sqrt(dist.l_2);
|
|
+ return dist;
|
|
+}
|
|
+
|
|
+// check if the specified length + stride/dist is contiguous
|
|
+template <typename Tint1, typename Tint2>
|
|
+bool is_contiguous_rowmajor(const std::vector<Tint1>& length,
|
|
+ const std::vector<Tint2>& stride,
|
|
+ size_t dist)
|
|
+{
|
|
+ size_t expected_stride = 1;
|
|
+ auto stride_it = stride.rbegin();
|
|
+ auto length_it = length.rbegin();
|
|
+ for(; stride_it != stride.rend() && length_it != length.rend(); ++stride_it, ++length_it)
|
|
+ {
|
|
+ if(*stride_it != expected_stride)
|
|
+ return false;
|
|
+ expected_stride *= *length_it;
|
|
+ }
|
|
+ return expected_stride == dist;
|
|
+}
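+
+// Quick self-check of the predicate above (editor's example): a row-major
+// {4, 4} layout with strides {4, 1} and dist 16 is contiguous; padding the
+// strides to {8, 2} with dist 32 is not.
+#include <cassert>
+inline void is_contiguous_rowmajor_example()
+{
+ const std::vector<size_t> len{4, 4};
+ assert(is_contiguous_rowmajor(len, std::vector<size_t>{4, 1}, 16));
+ assert(!is_contiguous_rowmajor(len, std::vector<size_t>{8, 2}, 32));
+}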
|
|
+
|
|
+// Unroll arbitrary-dimension distance into specializations for 1-, 2-, 3-dimensions
|
|
+template <typename Tint1, typename Tint2, typename Tint3>
|
|
+inline VectorNorms distance(const std::vector<hostbuf>& input,
|
|
+ const std::vector<hostbuf>& output,
|
|
+ std::vector<Tint1> length,
|
|
+ size_t nbatch,
|
|
+ const fft_precision precision,
|
|
+ const fft_array_type itype,
|
|
+ std::vector<Tint2> istride,
|
|
+ const size_t idist,
|
|
+ const fft_array_type otype,
|
|
+ std::vector<Tint3> ostride,
|
|
+ const size_t odist,
|
|
+ std::vector<std::pair<size_t, size_t>>* linf_failures,
|
|
+ const double linf_cutoff,
|
|
+ const std::vector<size_t>& ioffset,
|
|
+ const std::vector<size_t>& ooffset,
|
|
+ const double output_scalar = 1.0)
|
|
+{
|
|
+ // If istride and ostride are both contiguous, collapse them down
|
|
+ // to one dimension. Index calculation is simpler (and faster)
|
|
+ // in the 1D case.
|
|
+ if(is_contiguous_rowmajor(length, istride, idist)
|
|
+ && is_contiguous_rowmajor(length, ostride, odist))
|
|
+ {
|
|
+ length = {product(length.begin(), length.end()) * nbatch};
|
|
+ istride = {static_cast<Tint2>(1)};
|
|
+ ostride = {static_cast<Tint3>(1)};
|
|
+ nbatch = 1;
|
|
+ }
|
|
+
|
|
+ switch(length.size())
|
|
+ {
|
|
+ case 1:
|
|
+ return distance(input,
|
|
+ output,
|
|
+ length[0],
|
|
+ nbatch,
|
|
+ precision,
|
|
+ itype,
|
|
+ istride[0],
|
|
+ idist,
|
|
+ otype,
|
|
+ ostride[0],
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ case 2:
|
|
+ return distance(input,
|
|
+ output,
|
|
+ std::make_tuple(length[0], length[1]),
|
|
+ nbatch,
|
|
+ precision,
|
|
+ itype,
|
|
+ std::make_tuple(istride[0], istride[1]),
|
|
+ idist,
|
|
+ otype,
|
|
+ std::make_tuple(ostride[0], ostride[1]),
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ case 3:
|
|
+ return distance(input,
|
|
+ output,
|
|
+ std::make_tuple(length[0], length[1], length[2]),
|
|
+ nbatch,
|
|
+ precision,
|
|
+ itype,
|
|
+ std::make_tuple(istride[0], istride[1], istride[2]),
|
|
+ idist,
|
|
+ otype,
|
|
+ std::make_tuple(ostride[0], ostride[1], ostride[2]),
|
|
+ odist,
|
|
+ linf_failures,
|
|
+ linf_cutoff,
|
|
+ ioffset,
|
|
+ ooffset,
|
|
+ output_scalar);
|
|
+ default:
|
|
+ abort();
|
|
+ }
|
|
+}
|
|
+
|
|
+// Compute the L-infinity and L-2 norm of a buffer with strides istride and
|
|
+// batch distance idist. Data is rocfft_complex.
|
|
+template <typename Tcomplex, typename T1, typename T2>
|
|
+inline VectorNorms norm_complex(const Tcomplex* input,
|
|
+ const T1& whole_length,
|
|
+ const size_t nbatch,
|
|
+ const T2& istride,
|
|
+ const size_t idist,
|
|
+ const std::vector<size_t>& offset)
|
|
+{
|
|
+ double linf = 0.0;
|
|
+ double l2 = 0.0;
|
|
+
|
|
+ size_t idx_base = 0;
|
|
+ auto partitions = partition_rowmajor(whole_length);
|
|
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist)
|
|
+ {
|
|
+#ifdef _OPENMP
|
|
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
|
|
+#endif
|
|
+ for(size_t part = 0; part < partitions.size(); ++part)
|
|
+ {
|
|
+ double cur_linf = 0.0;
|
|
+ double cur_l2 = 0.0;
|
|
+ auto index = partitions[part].first;
|
|
+ const auto length = partitions[part].second;
|
|
+ do
|
|
+ {
|
|
+ const auto idx = compute_index(index, istride, idx_base);
|
|
+
|
|
+ const double rval = std::abs(static_cast<double>(input[idx + offset[0]].real()));
|
|
+ cur_linf = std::max(rval, cur_linf);
|
|
+ cur_l2 += rval * rval;
|
|
+
|
|
+ const double ival = std::abs(static_cast<double>(input[idx + offset[0]].imag()));
|
|
+ cur_linf = std::max(ival, cur_linf);
|
|
+ cur_l2 += ival * ival;
|
|
+
|
|
+ } while(increment_rowmajor(index, length));
|
|
+ linf = std::max(linf, cur_linf);
|
|
+ l2 += cur_l2;
|
|
+ }
|
|
+ }
|
|
+ return {.l_2 = sqrt(l2), .l_inf = linf};
|
|
+}
|
|
+
|
|
+// Compute the L-infinity and L-2 norm of a buffer with strides istride and
+// batch distance idist. Data is real-valued.
|
|
+template <typename Tfloat, typename T1, typename T2>
|
|
+inline VectorNorms norm_real(const Tfloat* input,
|
|
+ const T1& whole_length,
|
|
+ const size_t nbatch,
|
|
+ const T2& istride,
|
|
+ const size_t idist,
|
|
+ const std::vector<size_t>& offset)
|
|
+{
|
|
+ double linf = 0.0;
|
|
+ double l2 = 0.0;
|
|
+
|
|
+ size_t idx_base = 0;
|
|
+ auto partitions = partition_rowmajor(whole_length);
|
|
+ for(size_t b = 0; b < nbatch; b++, idx_base += idist)
|
|
+ {
|
|
+#ifdef _OPENMP
|
|
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
|
|
+#endif
|
|
+ for(size_t part = 0; part < partitions.size(); ++part)
|
|
+ {
|
|
+ double cur_linf = 0.0;
|
|
+ double cur_l2 = 0.0;
|
|
+ auto index = partitions[part].first;
|
|
+ const auto length = partitions[part].second;
|
|
+ do
|
|
+ {
|
|
+ const auto idx = compute_index(index, istride, idx_base);
|
|
+ const double val = std::abs(static_cast<double>(input[idx + offset[0]]));
|
|
+ cur_linf = std::max(val, cur_linf);
|
|
+ cur_l2 += val * val;
|
|
+
|
|
+ } while(increment_rowmajor(index, length));
|
|
+ linf = std::max(linf, cur_linf);
|
|
+ l2 += cur_l2;
|
|
+ }
|
|
+ }
|
|
+ return {.l_2 = sqrt(l2), .l_inf = linf};
|
|
+}
|
|
+
|
|
+// Compute the L-infinity and L-2 norm of a buffer with strides istride and
+// batch distance idist. Data format is given by precision and itype.
|
|
+template <typename T1, typename T2>
|
|
+inline VectorNorms norm(const std::vector<hostbuf>& input,
|
|
+ const T1& length,
|
|
+ const size_t nbatch,
|
|
+ const fft_precision precision,
|
|
+ const fft_array_type itype,
|
|
+ const T2& istride,
|
|
+ const size_t idist,
|
|
+ const std::vector<size_t>& offset)
|
|
+{
|
|
+ VectorNorms norm;
|
|
+
|
|
+ switch(itype)
|
|
+ {
|
|
+ case fft_array_type_complex_interleaved:
|
|
+ case fft_array_type_hermitian_interleaved:
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ norm = norm_complex(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ offset);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ norm = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ offset);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ norm = norm_complex(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ offset);
|
|
+ break;
|
|
+ }
|
|
+ norm.l_2 *= norm.l_2;
|
|
+ break;
|
|
+ case fft_array_type_real:
|
|
+ case fft_array_type_complex_planar:
|
|
+ case fft_array_type_hermitian_planar:
|
|
+ for(unsigned int idx = 0; idx < input.size(); ++idx)
|
|
+ {
|
|
+ VectorNorms n;
|
|
+ switch(precision)
|
|
+ {
|
|
+ case fft_precision_half:
|
|
+ n = norm_real(reinterpret_cast<const _Float16*>(input[idx].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ offset);
|
|
+ break;
|
|
+ case fft_precision_single:
|
|
+ n = norm_real(reinterpret_cast<const float*>(input[idx].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ offset);
|
|
+ break;
|
|
+ case fft_precision_double:
|
|
+ n = norm_real(reinterpret_cast<const double*>(input[idx].data()),
|
|
+ length,
|
|
+ nbatch,
|
|
+ istride,
|
|
+ idist,
|
|
+ offset);
|
|
+ break;
|
|
+ }
|
|
+ norm.l_inf = std::max(n.l_inf, norm.l_inf);
|
|
+ norm.l_2 += n.l_2 * n.l_2;
|
|
+ }
|
|
+ break;
|
|
+ default:
|
|
+ throw std::runtime_error("Invalid data type");
|
|
+ }
|
|
+
|
|
+ norm.l_2 = sqrt(norm.l_2);
|
|
+ return norm;
|
|
+}
|
|
+
|
|
+// Unroll arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions
|
|
+template <typename T1, typename T2>
|
|
+inline VectorNorms norm(const std::vector<hostbuf>& input,
|
|
+ std::vector<T1> length,
|
|
+ size_t nbatch,
|
|
+ const fft_precision precision,
|
|
+ const fft_array_type type,
|
|
+ std::vector<T2> stride,
|
|
+ const size_t dist,
|
|
+ const std::vector<size_t>& offset)
|
|
+{
|
|
+ // If stride is contiguous, collapse it down to one dimension.
|
|
+ // Index calculation is simpler (and faster) in the 1D case.
|
|
+ if(is_contiguous_rowmajor(length, stride, dist))
|
|
+ {
|
|
+ length = {product(length.begin(), length.end()) * nbatch};
|
|
+ stride = {static_cast<T2>(1)};
|
|
+ nbatch = 1;
|
|
+ }
|
|
+
|
|
+ switch(length.size())
|
|
+ {
|
|
+ case 1:
|
|
+ return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset);
|
|
+ case 2:
|
|
+ return norm(input,
|
|
+ std::make_tuple(length[0], length[1]),
|
|
+ nbatch,
|
|
+ precision,
|
|
+ type,
|
|
+ std::make_tuple(stride[0], stride[1]),
|
|
+ dist,
|
|
+ offset);
|
|
+ case 3:
|
|
+ return norm(input,
|
|
+ std::make_tuple(length[0], length[1], length[2]),
|
|
+ nbatch,
|
|
+ precision,
|
|
+ type,
|
|
+ std::make_tuple(stride[0], stride[1], stride[2]),
|
|
+ dist,
|
|
+ offset);
|
|
+ default:
|
|
+ abort();
|
|
+ }
|
|
+}
|
|
+
|
|
+// Given a data type and precision, the distance between batches, and
|
|
+// the batch size, allocate the required host buffer(s).
|
|
+static std::vector<hostbuf> allocate_host_buffer(const fft_precision precision,
|
|
+ const fft_array_type type,
|
|
+ const std::vector<size_t>& size)
|
|
+{
|
|
+ std::vector<hostbuf> buffers(size.size());
|
|
+ for(unsigned int i = 0; i < size.size(); ++i)
|
|
+ {
|
|
+ buffers[i].alloc(size[i] * var_size<size_t>(precision, type));
|
|
+ }
|
|
+ return buffers;
|
|
+}
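+
+// Editor's sketch: pairing the allocator above with a planar layout, the
+// size vector carries one element count per buffer:
+//
+// auto bufs = allocate_host_buffer(fft_precision_single,
+// fft_array_type_complex_planar,
+// {64, 64}); // two planes of 64 elements each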
|
|
+
|
|
+// Check if the required buffers fit in the device vram.
|
|
+inline bool vram_fits_problem(const size_t prob_size, const size_t vram_avail, int deviceId = 0)
|
|
+{
|
|
+ // Keep a margin of 1 << 27 bytes (128 MiB) when fitting the problem into vram:
|
|
+ const size_t extra = 1 << 27;
|
|
+
|
|
+ return vram_avail > prob_size + extra;
|
|
+}
|
|
+
|
|
+// Computes the twiddle table VRAM footprint for r2c/c2r transforms.
|
|
+// This function will return 0 for the other transform types, since
|
|
+// the VRAM footprint in rocFFT is negligible for the other cases.
|
|
+inline size_t twiddle_table_vram_footprint(const fft_params& params)
|
|
+{
|
|
+ size_t vram_footprint = 0;
|
|
+
|
|
+ // Add vram footprint from real/complex even twiddle buffer size.
|
|
+ if(params.transform_type == fft_transform_type_real_forward
|
|
+ || params.transform_type == fft_transform_type_real_inverse)
|
|
+ {
|
|
+ const auto realdim = params.length.back();
|
|
+ if(realdim % 2 == 0)
|
|
+ {
|
|
+ const auto complex_size = params.precision == fft_precision_single ? 8 : 16;
|
|
+ // even length twiddle size is 1/4 of the real size, but
|
|
+ // in complex elements
|
|
+ vram_footprint += realdim * complex_size / 4;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return vram_footprint;
|
|
+}
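+
+// Worked example (editor's note): for a double-precision real-forward
+// transform whose last length N is even, the footprint above is
+// N * 16 / 4 bytes, e.g. N = 1024 gives 4096 bytes; odd lengths and
+// complex transforms report 0.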
|
|
+
|
|
+#endif
|
|
diff --git a/shared/fftw_transform.h b/shared/fftw_transform.h
new file mode 100644
index 0000000..873a373
--- /dev/null
+++ b/shared/fftw_transform.h
@@ -0,0 +1,493 @@
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+#ifndef FFTWTRANSFORM_H
+#define FFTWTRANSFORM_H
+
+#include "hostbuf.h"
+#include "rocfft_complex.h"
+#include "test_params.h"
+#include <fftw3.h>
+#include <vector>
+
+// Function to return maximum error for float and double types.
+//
+// Following Schatzman (1996; Accuracy of the Discrete Fourier
+// Transform and the Fast Fourier Transform), the shape of relative
+// l_2 error vs length should look like
+//
+// epsilon * sqrt(log2(length)).
+//
+// The magic epsilon constants below were chosen so that we get a
+// reasonable upper bound for (all of) our tests.
+//
+// For rocFFT, prime lengths result in the highest error. As such,
+// the epsilons below are perhaps too loose for pow2 lengths; but they
+// are appropriate for prime lengths.
+template <typename Tfloat>
+inline double type_epsilon();
+template <>
+inline double type_epsilon<_Float16>()
+{
+    return half_epsilon;
+}
+template <>
+inline double type_epsilon<float>()
+{
+    return single_epsilon;
+}
+template <>
+inline double type_epsilon<double>()
+{
+    return double_epsilon;
+}
+
+// C++ traits to translate float->fftwf_complex and
+// double->fftw_complex.
+// The correct FFTW complex type can be accessed via, for example,
+// using complex_t = typename fftw_complex_trait<Tfloat>::complex_t;
+template <typename Tfloat>
+struct fftw_trait;
+template <>
+struct fftw_trait<_Float16>
+{
+    // fftw does not support half precision, so use single precision and convert
+    using fftw_complex_type = fftwf_complex;
+    using fftw_plan_type    = fftwf_plan;
+};
+template <>
+struct fftw_trait<float>
+{
+    using fftw_complex_type = fftwf_complex;
+    using fftw_plan_type    = fftwf_plan;
+};
+template <>
+struct fftw_trait<double>
+{
+    using fftw_complex_type = fftw_complex;
+    using fftw_plan_type    = fftw_plan;
+};
+
+// Copies the half-precision input buffer to a single-precision
+// buffer. Note that the input buffer is already sized like it's a
+// single-precision buffer (but only half of it is filled), because
+// we allocate a single-precision buffer for FFTW to plan with.
+static hostbuf half_to_single_copy(const hostbuf& in)
+{
+    auto out = in.copy();
+    auto in_begin = reinterpret_cast<const _Float16*>(in.data());
+    std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast<float*>(out.data()));
+    return out;
+}
+
+// converts a wider precision buffer to a narrower precision, in-place
+template <typename TfloatIn, typename TfloatOut>
+void narrow_precision_inplace(hostbuf& in)
+{
+    // ensure we're actually shrinking the data
+    static_assert(sizeof(TfloatIn) > sizeof(TfloatOut));
+
+    auto readPtr  = reinterpret_cast<const TfloatIn*>(in.data());
+    auto writePtr = reinterpret_cast<TfloatOut*>(in.data());
+    std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr);
+    in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut)));
+}
+
+static void single_to_half_inplace(hostbuf& in)
+{
+    narrow_precision_inplace<float, _Float16>(in);
+}
+
+// Template wrappers for real-valued FFTW allocators:
+template <typename Tfloat>
+inline Tfloat* fftw_alloc_real_type(size_t n);
+template <>
+inline float* fftw_alloc_real_type<float>(size_t n)
+{
+    return fftwf_alloc_real(n);
+}
+template <>
+inline double* fftw_alloc_real_type<double>(size_t n)
+{
+    return fftw_alloc_real(n);
+}
+
+// Template wrappers for complex-valued FFTW allocators:
+template <typename Tfloat>
+inline typename fftw_trait<Tfloat>::fftw_complex_type* fftw_alloc_complex_type(size_t n);
+template <>
+inline typename fftw_trait<float>::fftw_complex_type* fftw_alloc_complex_type<float>(size_t n)
+{
+    return fftwf_alloc_complex(n);
+}
+template <>
+inline typename fftw_trait<double>::fftw_complex_type* fftw_alloc_complex_type<double>(size_t n)
+{
+    return fftw_alloc_complex(n);
+}
+
+template <typename fftw_type>
+inline fftw_type* fftw_alloc_type(size_t n);
+template <>
+inline float* fftw_alloc_type<float>(size_t n)
+{
+    return fftw_alloc_real_type<float>(n);
+}
+template <>
+inline double* fftw_alloc_type<double>(size_t n)
+{
+    return fftw_alloc_real_type<double>(n);
+}
+template <>
+inline fftwf_complex* fftw_alloc_type<fftwf_complex>(size_t n)
+{
+    return fftw_alloc_complex_type<float>(n);
+}
+template <>
+inline fftw_complex* fftw_alloc_type<fftw_complex>(size_t n)
+{
+    return fftw_alloc_complex_type<double>(n);
+}
+template <>
+inline rocfft_complex<float>* fftw_alloc_type<rocfft_complex<float>>(size_t n)
+{
+    return (rocfft_complex<float>*)fftw_alloc_complex_type<float>(n);
+}
+template <>
+inline rocfft_complex<double>* fftw_alloc_type<rocfft_complex<double>>(size_t n)
+{
+    return (rocfft_complex<double>*)fftw_alloc_complex_type<double>(n);
+}
+
+// Template wrappers for FFTW plan executors:
+template <typename Tfloat>
+inline void fftw_execute_type(typename fftw_trait<Tfloat>::fftw_plan_type plan);
+template <>
+inline void fftw_execute_type<float>(typename fftw_trait<float>::fftw_plan_type plan)
+{
+    return fftwf_execute(plan);
+}
+template <>
+inline void fftw_execute_type<double>(typename fftw_trait<double>::fftw_plan_type plan)
+{
+    return fftw_execute(plan);
+}
+
+// Template wrappers for FFTW plan destroyers:
+template <typename Tfftw_plan>
+inline void fftw_destroy_plan_type(Tfftw_plan plan);
+template <>
+inline void fftw_destroy_plan_type<fftwf_plan>(fftwf_plan plan)
+{
+    return fftwf_destroy_plan(plan);
+}
+template <>
+inline void fftw_destroy_plan_type<fftw_plan>(fftw_plan plan)
+{
+    return fftw_destroy_plan(plan);
+}
+
+// Template wrappers for FFTW c2c planners:
+template <typename Tfloat>
+inline typename fftw_trait<Tfloat>::fftw_plan_type
+    fftw_plan_guru64_dft(int rank,
+                         const fftw_iodim64* dims,
+                         int howmany_rank,
+                         const fftw_iodim64* howmany_dims,
+                         typename fftw_trait<Tfloat>::fftw_complex_type* in,
+                         typename fftw_trait<Tfloat>::fftw_complex_type* out,
+                         int sign,
+                         unsigned flags);
+
+template <>
+inline typename fftw_trait<_Float16>::fftw_plan_type
+    fftw_plan_guru64_dft<_Float16>(int rank,
+                                   const fftw_iodim64* dims,
+                                   int howmany_rank,
+                                   const fftw_iodim64* howmany_dims,
+                                   typename fftw_trait<_Float16>::fftw_complex_type* in,
+                                   typename fftw_trait<_Float16>::fftw_complex_type* out,
+                                   int sign,
+                                   unsigned flags)
+{
+    return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
+}
+
+template <>
+inline typename fftw_trait<float>::fftw_plan_type
+    fftw_plan_guru64_dft<float>(int rank,
+                                const fftw_iodim64* dims,
+                                int howmany_rank,
+                                const fftw_iodim64* howmany_dims,
+                                typename fftw_trait<float>::fftw_complex_type* in,
+                                typename fftw_trait<float>::fftw_complex_type* out,
+                                int sign,
+                                unsigned flags)
+{
+    return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
+}
+
+template <>
+inline typename fftw_trait<double>::fftw_plan_type
+    fftw_plan_guru64_dft<double>(int rank,
+                                 const fftw_iodim64* dims,
+                                 int howmany_rank,
+                                 const fftw_iodim64* howmany_dims,
+                                 typename fftw_trait<double>::fftw_complex_type* in,
+                                 typename fftw_trait<double>::fftw_complex_type* out,
+                                 int sign,
+                                 unsigned flags)
+{
+    return fftw_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
+}
+
+// Template wrappers for FFTW c2c executors:
+template <typename Tfloat>
+inline void fftw_plan_execute_c2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
+                                  std::vector<hostbuf>& in,
+                                  std::vector<hostbuf>& out);
+
+template <>
+inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
+                                            std::vector<hostbuf>& in,
+                                            std::vector<hostbuf>& out)
+{
+    // since FFTW does not natively support half precision, convert
+    // input to single, execute, then convert output back to half
+    auto in_single = half_to_single_copy(in.front());
+    fftwf_execute_dft(plan,
+                      reinterpret_cast<fftwf_complex*>(in_single.data()),
+                      reinterpret_cast<fftwf_complex*>(out.front().data()));
+    single_to_half_inplace(out.front());
+}
+
+template <>
+inline void fftw_plan_execute_c2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
+                                         std::vector<hostbuf>& in,
+                                         std::vector<hostbuf>& out)
+{
+    fftwf_execute_dft(plan,
+                      reinterpret_cast<fftwf_complex*>(in.front().data()),
+                      reinterpret_cast<fftwf_complex*>(out.front().data()));
+}
+
+template <>
+inline void fftw_plan_execute_c2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
+                                          std::vector<hostbuf>& in,
+                                          std::vector<hostbuf>& out)
+{
+    fftw_execute_dft(plan,
+                     reinterpret_cast<fftw_complex*>(in.front().data()),
+                     reinterpret_cast<fftw_complex*>(out.front().data()));
+}
+
+// Template wrappers for FFTW r2c planners:
+template <typename Tfloat>
+inline typename fftw_trait<Tfloat>::fftw_plan_type
+    fftw_plan_guru64_r2c(int rank,
+                         const fftw_iodim64* dims,
+                         int howmany_rank,
+                         const fftw_iodim64* howmany_dims,
+                         Tfloat* in,
+                         typename fftw_trait<Tfloat>::fftw_complex_type* out,
+                         unsigned flags);
+template <>
+inline typename fftw_trait<_Float16>::fftw_plan_type
+    fftw_plan_guru64_r2c<_Float16>(int rank,
+                                   const fftw_iodim64* dims,
+                                   int howmany_rank,
+                                   const fftw_iodim64* howmany_dims,
+                                   _Float16* in,
+                                   typename fftw_trait<_Float16>::fftw_complex_type* out,
+                                   unsigned flags)
+{
+    return fftwf_plan_guru64_dft_r2c(
+        rank, dims, howmany_rank, howmany_dims, reinterpret_cast<float*>(in), out, flags);
+}
+template <>
+inline typename fftw_trait<float>::fftw_plan_type
+    fftw_plan_guru64_r2c<float>(int rank,
+                                const fftw_iodim64* dims,
+                                int howmany_rank,
+                                const fftw_iodim64* howmany_dims,
+                                float* in,
+                                typename fftw_trait<float>::fftw_complex_type* out,
+                                unsigned flags)
+{
+    return fftwf_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags);
+}
+template <>
+inline typename fftw_trait<double>::fftw_plan_type
+    fftw_plan_guru64_r2c<double>(int rank,
+                                 const fftw_iodim64* dims,
+                                 int howmany_rank,
+                                 const fftw_iodim64* howmany_dims,
+                                 double* in,
+                                 typename fftw_trait<double>::fftw_complex_type* out,
+                                 unsigned flags)
+{
+    return fftw_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags);
+}
+
+// Template wrappers for FFTW r2c executors:
+template <typename Tfloat>
+inline void fftw_plan_execute_r2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
+                                  std::vector<hostbuf>& in,
+                                  std::vector<hostbuf>& out);
+template <>
+inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait<float>::fftw_plan_type plan,
+                                            std::vector<hostbuf>& in,
+                                            std::vector<hostbuf>& out)
+{
+    // since FFTW does not natively support half precision, convert
+    // input to single, execute, then convert output back to half
+    auto in_single = half_to_single_copy(in.front());
+    fftwf_execute_dft_r2c(plan,
+                          reinterpret_cast<float*>(in_single.data()),
+                          reinterpret_cast<fftwf_complex*>(out.front().data()));
+    single_to_half_inplace(out.front());
+}
+template <>
+inline void fftw_plan_execute_r2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
+                                         std::vector<hostbuf>& in,
+                                         std::vector<hostbuf>& out)
+{
+    fftwf_execute_dft_r2c(plan,
+                          reinterpret_cast<float*>(in.front().data()),
+                          reinterpret_cast<fftwf_complex*>(out.front().data()));
+}
+template <>
+inline void fftw_plan_execute_r2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
+                                          std::vector<hostbuf>& in,
+                                          std::vector<hostbuf>& out)
+{
+    fftw_execute_dft_r2c(plan,
+                         reinterpret_cast<double*>(in.front().data()),
+                         reinterpret_cast<fftw_complex*>(out.front().data()));
+}
+
+// Template wrappers for FFTW c2r planners:
+template <typename Tfloat>
+inline typename fftw_trait<Tfloat>::fftw_plan_type
+    fftw_plan_guru64_c2r(int rank,
+                         const fftw_iodim64* dims,
+                         int howmany_rank,
+                         const fftw_iodim64* howmany_dims,
+                         typename fftw_trait<Tfloat>::fftw_complex_type* in,
+                         Tfloat* out,
+                         unsigned flags);
+template <>
+inline typename fftw_trait<_Float16>::fftw_plan_type
+    fftw_plan_guru64_c2r<_Float16>(int rank,
+                                   const fftw_iodim64* dims,
+                                   int howmany_rank,
+                                   const fftw_iodim64* howmany_dims,
+                                   typename fftw_trait<_Float16>::fftw_complex_type* in,
+                                   _Float16* out,
+                                   unsigned flags)
+{
+    return fftwf_plan_guru64_dft_c2r(
+        rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast<float*>(out), flags);
+}
+template <>
+inline typename fftw_trait<float>::fftw_plan_type
+    fftw_plan_guru64_c2r<float>(int rank,
+                                const fftw_iodim64* dims,
+                                int howmany_rank,
+                                const fftw_iodim64* howmany_dims,
+                                typename fftw_trait<float>::fftw_complex_type* in,
+                                float* out,
+                                unsigned flags)
+{
+    return fftwf_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags);
+}
+template <>
+inline typename fftw_trait<double>::fftw_plan_type
+    fftw_plan_guru64_c2r<double>(int rank,
+                                 const fftw_iodim64* dims,
+                                 int howmany_rank,
+                                 const fftw_iodim64* howmany_dims,
+                                 typename fftw_trait<double>::fftw_complex_type* in,
+                                 double* out,
+                                 unsigned flags)
+{
+    return fftw_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags);
+}
+
+// Template wrappers for FFTW c2r executors:
+template <typename Tfloat>
+inline void fftw_plan_execute_c2r(typename fftw_trait<Tfloat>::fftw_plan_type plan,
+                                  std::vector<hostbuf>& in,
+                                  std::vector<hostbuf>& out);
+template <>
+inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait<float>::fftw_plan_type plan,
+                                            std::vector<hostbuf>& in,
+                                            std::vector<hostbuf>& out)
+{
+    // since FFTW does not natively support half precision, convert
+    // input to single, execute, then convert output back to half
+    auto in_single = half_to_single_copy(in.front());
+    fftwf_execute_dft_c2r(plan,
+                          reinterpret_cast<fftwf_complex*>(in_single.data()),
+                          reinterpret_cast<float*>(out.front().data()));
+    single_to_half_inplace(out.front());
+}
+template <>
+inline void fftw_plan_execute_c2r<float>(typename fftw_trait<float>::fftw_plan_type plan,
+                                         std::vector<hostbuf>& in,
+                                         std::vector<hostbuf>& out)
+{
+    fftwf_execute_dft_c2r(plan,
+                          reinterpret_cast<fftwf_complex*>(in.front().data()),
+                          reinterpret_cast<float*>(out.front().data()));
+}
+template <>
+inline void fftw_plan_execute_c2r<double>(typename fftw_trait<double>::fftw_plan_type plan,
+                                          std::vector<hostbuf>& in,
+                                          std::vector<hostbuf>& out)
+{
+    fftw_execute_dft_c2r(plan,
+                         reinterpret_cast<fftw_complex*>(in.front().data()),
+                         reinterpret_cast<double*>(out.front().data()));
+}
+
+#ifdef FFTW_HAVE_SPRINT_PLAN
+// Template wrappers for FFTW print plan:
+template <typename Tfloat>
+inline char* fftw_sprint_plan(const typename fftw_trait<Tfloat>::fftw_plan_type plan);
+template <>
+inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan)
+{
+    return fftwf_sprint_plan(plan);
+}
+template <>
+inline char* fftw_sprint_plan<float>(const typename fftw_trait<float>::fftw_plan_type plan)
+{
+    return fftwf_sprint_plan(plan);
+}
+template <>
+inline char* fftw_sprint_plan<double>(const typename fftw_trait<double>::fftw_plan_type plan)
+{
+    return fftw_sprint_plan(plan);
+}
+#endif
+
+#endif
diff --git a/shared/gpubuf.h b/shared/gpubuf.h
new file mode 100644
index 0000000..993fa95
--- /dev/null
+++ b/shared/gpubuf.h
@@ -0,0 +1,134 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_GPUBUF_H
+#define ROCFFT_GPUBUF_H
+
+#include "rocfft_hip.h"
+#include <cstdlib>
+
+// Simple RAII class for GPU buffers. T is the type of pointer that
+// data() returns
+template <class T = void>
+class gpubuf_t
+{
+public:
+    gpubuf_t() {}
+    // buffers are movable but not copyable
+    gpubuf_t(gpubuf_t&& other)
+    {
+        std::swap(buf, other.buf);
+        std::swap(bsize, other.bsize);
+        std::swap(device, other.device);
+    }
+    gpubuf_t& operator=(gpubuf_t&& other)
+    {
+        std::swap(buf, other.buf);
+        std::swap(bsize, other.bsize);
+        std::swap(device, other.device);
+        return *this;
+    }
+    gpubuf_t(const gpubuf_t&) = delete;
+    gpubuf_t& operator=(const gpubuf_t&) = delete;
+
+    ~gpubuf_t()
+    {
+        free();
+    }
+
+    static bool use_alloc_managed()
+    {
+        return std::getenv("ROCFFT_MALLOC_MANAGED");
+    }
+
+    hipError_t alloc(const size_t size)
+    {
+        // remember the device that was current as of alloc, so we can
+        // free on the correct device
+        auto ret = hipGetDevice(&device);
+        if(ret != hipSuccess)
+            return ret;
+
+        bsize = size;
+        static bool alloc_managed = use_alloc_managed();
+        free();
+        ret = alloc_managed ? hipMallocManaged(&buf, bsize) : hipMalloc(&buf, bsize);
+        if(ret != hipSuccess)
+        {
+            buf = nullptr;
+            bsize = 0;
+        }
+        return ret;
+    }
+
+    size_t size() const
+    {
+        return bsize;
+    }
+
+    void free()
+    {
+        if(buf != nullptr)
+        {
+            // free on the device we allocated on
+            rocfft_scoped_device dev(device);
+            (void)hipFree(buf);
+            buf = nullptr;
+            bsize = 0;
+        }
+    }
+
+    // return a pointer to the allocated memory, offset by the
+    // specified number of bytes
+    T* data_offset(size_t offset_bytes = 0) const
+    {
+        void* ptr = static_cast<char*>(buf) + offset_bytes;
+        return static_cast<T*>(ptr);
+    }
+
+    T* data() const
+    {
+        return static_cast<T*>(buf);
+    }
+
+    // equality/bool tests
+    bool operator==(std::nullptr_t n) const
+    {
+        return buf == n;
+    }
+    bool operator!=(std::nullptr_t n) const
+    {
+        return buf != n;
+    }
+    operator bool() const
+    {
+        return buf;
+    }
+
+private:
+    // The GPU buffer
+    void* buf = nullptr;
+    size_t bsize = 0;
+    int device = 0;
+};
+
+// default gpubuf that gives out void* pointers
+typedef gpubuf_t<> gpubuf;
+#endif
diff --git a/shared/hip_object_wrapper.h b/shared/hip_object_wrapper.h
new file mode 100644
index 0000000..54083ab
--- /dev/null
+++ b/shared/hip_object_wrapper.h
@@ -0,0 +1,86 @@
+/******************************************************************************
+* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef ROCFFT_HIP_OBJ_WRAPPER_H
+#define ROCFFT_HIP_OBJ_WRAPPER_H
+
+#include "rocfft_hip.h"
+
+// RAII wrapper around HIP objects
+template <typename T, auto TCreate, auto TDestroy>
+struct hip_object_wrapper_t
+{
+    hip_object_wrapper_t()
+        : obj(nullptr)
+    {
+    }
+
+    void alloc()
+    {
+        if(obj == nullptr && TCreate(&obj) != hipSuccess)
+            throw std::runtime_error("hip create failure");
+    }
+
+    void free()
+    {
+        if(obj)
+        {
+            (void)TDestroy(obj);
+            obj = nullptr;
+        }
+    }
+
+    operator const T&() const
+    {
+        return obj;
+    }
+    operator T&()
+    {
+        return obj;
+    }
+
+    operator bool() const
+    {
+        return obj != nullptr;
+    }
+
+    ~hip_object_wrapper_t()
+    {
+        free();
+    }
+
+    hip_object_wrapper_t(const hip_object_wrapper_t&) = delete;
+    hip_object_wrapper_t& operator=(const hip_object_wrapper_t&) = delete;
+    hip_object_wrapper_t(hip_object_wrapper_t&& other)
+        : obj(other.obj)
+    {
+        other.obj = nullptr;
+    }
+
+private:
+    T obj;
+};
+
+typedef hip_object_wrapper_t<hipStream_t, hipStreamCreate, hipStreamDestroy> hipStream_wrapper_t;
+typedef hip_object_wrapper_t<hipEvent_t, hipEventCreate, hipEventDestroy> hipEvent_wrapper_t;
+
+#endif // ROCFFT_HIP_OBJ_WRAPPER_H
diff --git a/shared/hostbuf.h b/shared/hostbuf.h
new file mode 100644
index 0000000..0a96c7d
--- /dev/null
+++ b/shared/hostbuf.h
@@ -0,0 +1,158 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_HOSTBUF_H
+#define ROCFFT_HOSTBUF_H
+
+#include "arithmetic.h"
+#include <cstdlib>
+#include <cstring>
+
+#ifndef WIN32
+#include <stdlib.h>
+#include <sys/mman.h>
+#endif
+
+// Simple RAII class for host buffers. T is the type of pointer that
+// data() returns
+template <class T = void>
+class hostbuf_t
+{
+public:
+    hostbuf_t() {}
+    // buffers are movable but not copyable
+    hostbuf_t(hostbuf_t&& other)
+    {
+        std::swap(buf, other.buf);
+        std::swap(bsize, other.bsize);
+    }
+    hostbuf_t& operator=(hostbuf_t&& other)
+    {
+        std::swap(buf, other.buf);
+        std::swap(bsize, other.bsize);
+        return *this;
+    }
+    hostbuf_t(const hostbuf_t&) = delete;
+    hostbuf_t& operator=(const hostbuf_t&) = delete;
+
+    ~hostbuf_t()
+    {
+        free();
+    }
+
+    void alloc(size_t size)
+    {
+        bsize = size;
+        free();
+
+        // we're aligning to multiples of 64 bytes, so round the
+        // allocation size up to the nearest 64 to keep ASAN happy
+        if(size % 64)
+        {
+            size += 64 - size % 64;
+        }
+
+        // FFTW requires aligned allocations to use faster SIMD instructions.
+        // If enabling hugepages, align to 2 MiB. Otherwise, aligning to
+        // 64 bytes is enough for AVX instructions up to AVX512.
+#ifdef WIN32
+        buf = _aligned_malloc(size, 64);
+#else
+        // On Linux, ask for hugepages to reduce TLB pressure and
+        // improve performance. Allocations need to be aligned to
+        // the hugepage size, and rounded up to the next whole
+        // hugepage.
+        static const size_t TWO_MiB = 2 * 1024 * 1024;
+        if(size >= TWO_MiB)
+        {
+            size_t rounded_size = DivRoundingUp(size, TWO_MiB) * TWO_MiB;
+            buf = aligned_alloc(TWO_MiB, rounded_size);
+            madvise(buf, rounded_size, MADV_HUGEPAGE);
+        }
+        else
+            buf = aligned_alloc(64, size);
+#endif
+    }
+
+    size_t size() const
+    {
+        return bsize;
+    }
+
+    void free()
+    {
+        if(buf != nullptr)
+        {
+#ifdef WIN32
+            _aligned_free(buf);
+#else
+            std::free(buf);
+#endif
+            buf = nullptr;
+            bsize = 0;
+        }
+    }
+
+    T* data() const
+    {
+        return static_cast<T*>(buf);
+    }
+
+    // Copy method
+    hostbuf_t copy() const
+    {
+        hostbuf_t copy;
+        copy.alloc(bsize);
+        memcpy(copy.buf, buf, bsize);
+        return copy;
+    }
+
+    // shrink the buffer to fit the new size
+    void shrink(size_t new_size)
+    {
+        if(new_size > bsize)
+            throw std::runtime_error("can't shrink hostbuf to larger size");
+        // just pretend the buffer is now that size
+        bsize = new_size;
+    }
+
+    // equality/bool tests
+    bool operator==(std::nullptr_t n) const
+    {
+        return buf == n;
+    }
+    bool operator!=(std::nullptr_t n) const
+    {
+        return buf != n;
+    }
+    operator bool() const
+    {
+        return buf;
+    }
+
+private:
+    // The host buffer
+    void* buf = nullptr;
+    size_t bsize = 0;
+};
+
+// default hostbuf that gives out void* pointers
+typedef hostbuf_t<> hostbuf;
+#endif
diff --git a/shared/increment.h b/shared/increment.h
new file mode 100644
index 0000000..90bba1d
--- /dev/null
+++ b/shared/increment.h
@@ -0,0 +1,100 @@
+// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_INCREMENT_H
+#define ROCFFT_INCREMENT_H
+
+#include <algorithm>
+#include <tuple>
+#include <vector>
+
+// Helper functions to iterate over a buffer in row-major order.
+// Indexes may be given as either a tuple or vector of sizes. They
+// return true if the index was successfully incremented to move to
+// the next element in the buffer.
+
+template <typename T1, typename T2>
+static bool increment_base(T1& index, const T2& length)
+{
+    static_assert(std::is_integral<T1>::value, "Integral required.");
+    static_assert(std::is_integral<T2>::value, "Integral required.");
+    if(index < length - 1)
+    {
+        ++index;
+        return true;
+    }
+    index = 0;
+    return false;
+}
+
+// Increment the index (row-major) for looping over 1, 2, and 3 dimensions length.
+template <typename T1, typename T2>
+static bool increment_rowmajor(T1& index, const T2& length)
+{
+    static_assert(std::is_integral<T1>::value, "Integral required.");
+    static_assert(std::is_integral<T2>::value, "Integral required.");
+    return increment_base(index, length);
+}
+
+template <typename T1, typename T2>
+static bool increment_rowmajor(std::tuple<T1, T1>& index, const std::tuple<T2, T2>& length)
+{
+    if(increment_base(std::get<1>(index), std::get<1>(length)))
+        // we incremented ok, nothing further to do
+        return true;
+    // otherwise, we rolled over
+    return increment_base(std::get<0>(index), std::get<0>(length));
+}
+
+template <typename T1, typename T2>
+static bool increment_rowmajor(std::tuple<T1, T1, T1>& index, const std::tuple<T2, T2, T2>& length)
+{
+    if(increment_base(std::get<2>(index), std::get<2>(length)))
+        // we incremented ok, nothing further to do
+        return true;
+    if(increment_base(std::get<1>(index), std::get<1>(length)))
+        // we incremented ok, nothing further to do
+        return true;
+    // otherwise, we rolled over
+    return increment_base(std::get<0>(index), std::get<0>(length));
+}
+
+// Increment row-major index over arbitrary dimension length
+template <typename T1, typename T2>
+bool increment_rowmajor(std::vector<T1>& index, const std::vector<T2>& length)
+{
+    for(int idim = length.size(); idim-- > 0;)
+    {
+        if(index[idim] < length[idim])
+        {
+            if((++index[idim]) == length[idim])
+            {
+                index[idim] = 0;
+                continue;
+            }
+            // we know we were able to increment something and didn't hit the end
+            return true;
+        }
+    }
+    // End the loop when we get back to the start:
+    return !std::all_of(index.begin(), index.end(), [](int i) { return i == 0; });
+}
+
+#endif
diff --git a/shared/precision_type.h b/shared/precision_type.h
new file mode 100644
index 0000000..526fc9a
--- /dev/null
+++ b/shared/precision_type.h
@@ -0,0 +1,70 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_PRECISION_TYPE_H
+#define ROCFFT_PRECISION_TYPE_H
+
+#include "array_predicate.h"
+#include "rocfft/rocfft.h"
+
+static size_t real_type_size(rocfft_precision precision)
+{
+    switch(precision)
+    {
+    case rocfft_precision_half:
+        return 2;
+    case rocfft_precision_single:
+        return 4;
+    case rocfft_precision_double:
+        return 8;
+    }
+}
+
+static size_t complex_type_size(rocfft_precision precision)
+{
+    return real_type_size(precision) * 2;
+}
+
+static const char* precision_name(rocfft_precision precision)
+{
+    switch(precision)
+    {
+    case rocfft_precision_half:
+        return "half";
+    case rocfft_precision_single:
+        return "single";
+    case rocfft_precision_double:
+        return "double";
+    }
+}
+
+static size_t element_size(rocfft_precision precision, rocfft_array_type array_type)
+{
+    return array_type_is_complex(array_type) ? complex_type_size(precision)
+                                             : real_type_size(precision);
+}
+
+// offset a pointer by a number of elements, given the elements'
+// precision and type (complex or not)
+static void* ptr_offset(void* p, size_t elems, rocfft_precision precision, rocfft_array_type type)
+{
+    return static_cast<char*>(p) + elems * element_size(precision, type);
+}
+#endif
diff --git a/shared/printbuffer.h b/shared/printbuffer.h
new file mode 100644
index 0000000..5ae0b64
--- /dev/null
+++ b/shared/printbuffer.h
@@ -0,0 +1,108 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef PRINTBUFFER_H
+#define PRINTBUFFER_H
+
+#include "hostbuf.h"
+#include "increment.h"
+#include <algorithm>
+#include <vector>
+
+// Output a formatted general-dimensional array with given length and stride in batches
+// separated by dist.
+template <typename Toutput, typename T1, typename T2, typename Tsize, typename Tstream>
+inline void printbuffer(const Toutput* output,
+                        const std::vector<T1>& length,
+                        const std::vector<T2>& stride,
+                        const Tsize nbatch,
+                        const Tsize dist,
+                        const size_t offset,
+                        Tstream& stream)
+{
+    auto i_base = 0;
+    for(unsigned int b = 0; b < nbatch; b++, i_base += dist)
+    {
+        std::vector<size_t> index(length.size());
+        std::fill(index.begin(), index.end(), 0);
+        do
+        {
+            const int i
+                = std::inner_product(index.begin(), index.end(), stride.begin(), i_base + offset);
+            stream << output[i] << " ";
+            for(int li = index.size(); li-- > 0;)
+            {
+                if(index[li] == (length[li] - 1))
+                {
+                    stream << "\n";
+                }
+                else
+                {
+                    break;
+                }
+            }
+        } while(increment_rowmajor(index, length));
+        stream << std::endl;
+    }
+}
+
+template <typename Telem>
+class buffer_printer
+{
+    // The scalar versions might be part of a planar format.
+public:
+    template <typename Tint1, typename Tint2, typename Tsize, typename Tstream = std::ostream>
+    static void print_buffer(const std::vector<hostbuf>& buf,
+                             const std::vector<Tint1>& length,
+                             const std::vector<Tint2>& stride,
+                             const Tsize nbatch,
+                             const Tsize dist,
+                             const std::vector<size_t>& offset,
+                             Tstream& stream = std::cout)
+    {
+        for(const auto& vec : buf)
+        {
+            printbuffer(reinterpret_cast<const Telem*>(vec.data()),
+                        length,
+                        stride,
+                        nbatch,
+                        dist,
+                        offset[0],
+                        stream);
+        }
+    };
+    template <typename Tstream = std::ostream>
+    static void print_buffer_flat(const std::vector<hostbuf>& buf,
+                                  const std::vector<size_t>& size,
+                                  const std::vector<size_t>& offset,
+                                  Tstream& stream = std::cout)
+    {
+        for(const auto& vec : buf)
+        {
+            auto data = reinterpret_cast<const Telem*>(vec.data());
+            stream << "idx " << 0;
+            for(size_t i = 0; i < size[0]; ++i)
+                stream << " " << data[i];
+            stream << std::endl;
+        }
+    };
+};
+
+#endif
diff --git a/shared/ptrdiff.h b/shared/ptrdiff.h
new file mode 100644
index 0000000..3bd15de
--- /dev/null
+++ b/shared/ptrdiff.h
@@ -0,0 +1,40 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+// Compute the farthest point from the original pointer.
+static size_t compute_ptrdiff(const std::vector<size_t>& length,
+                              const std::vector<size_t>& stride,
+                              const size_t nbatch,
+                              const size_t dist)
+{
+    size_t val = 0;
+    if(!length.empty())
+    {
+        val = 1;
+        for(unsigned int i = 0; i < length.size(); ++i)
+        {
+            val += (length[i] - 1) * stride[i];
+        }
+        val += (nbatch - 1) * dist;
+    }
+    return val;
+}
diff --git a/shared/rocfft_accuracy_test.h b/shared/rocfft_accuracy_test.h
new file mode 100644
index 0000000..4ce3059
--- /dev/null
+++ b/shared/rocfft_accuracy_test.h
@@ -0,0 +1,29 @@
+// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_ACCURACY_TEST
+#define ROCFFT_ACCURACY_TEST
+
+#include "accuracy_test.h"
+#include "rocfft_params.h"
+
+void fft_vs_reference(rocfft_params& params, bool round_trip = false);
+
+#endif
diff --git a/shared/rocfft_against_fftw.h b/shared/rocfft_against_fftw.h
new file mode 100644
index 0000000..d03754c
--- /dev/null
+++ b/shared/rocfft_against_fftw.h
@@ -0,0 +1,231 @@
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+#ifndef ROCFFT_AGAINST_FFTW
+#define ROCFFT_AGAINST_FFTW
+
+#include <gtest/gtest.h>
+#include <math.h>
+#include <stdexcept>
+#include <vector>
+
+#include "fftw_transform.h"
+
+// Return the precision enum for rocFFT based upon the type.
+template <typename Tfloat>
+inline fft_precision precision_selector();
+template <>
+inline fft_precision precision_selector<float>()
+{
+    return fft_precision_single;
+}
+template <>
+inline fft_precision precision_selector<double>()
+{
+    return fft_precision_double;
+}
+
+extern bool use_fftw_wisdom;
+
+// construct and return an FFTW plan with the specified type,
+// precision, and dimensions. cpu_out is required if we're using
+// wisdom, which runs actual FFTs to work out the best plan.
+template <typename Tfloat>
+static typename fftw_trait<Tfloat>::fftw_plan_type
+    fftw_plan_with_precision(const std::vector<fftw_iodim64>& dims,
+                             const std::vector<fftw_iodim64>& howmany_dims,
+                             const fft_transform_type transformType,
+                             const size_t isize,
+                             void* cpu_in,
+                             void* cpu_out)
+{
+    using fftw_complex_type = typename fftw_trait<Tfloat>::fftw_complex_type;
+
+    // NB: Using FFTW_MEASURE implies that the input buffer's data
+    // may be destroyed during plan creation. But if we're wanting
+    // to run FFTW in the first place, we must have just created an
+    // uninitialized input buffer anyway.
+
+    switch(transformType)
+    {
+    case fft_transform_type_complex_forward:
+        return fftw_plan_guru64_dft<Tfloat>(dims.size(),
+                                            dims.data(),
+                                            howmany_dims.size(),
+                                            howmany_dims.data(),
+                                            reinterpret_cast<fftw_complex_type*>(cpu_in),
+                                            reinterpret_cast<fftw_complex_type*>(cpu_out),
+                                            -1,
+                                            use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
+    case fft_transform_type_complex_inverse:
+        return fftw_plan_guru64_dft<Tfloat>(dims.size(),
+                                            dims.data(),
+                                            howmany_dims.size(),
+                                            howmany_dims.data(),
+                                            reinterpret_cast<fftw_complex_type*>(cpu_in),
+                                            reinterpret_cast<fftw_complex_type*>(cpu_out),
+                                            1,
+                                            use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
+    case fft_transform_type_real_forward:
+        return fftw_plan_guru64_r2c<Tfloat>(dims.size(),
+                                            dims.data(),
+                                            howmany_dims.size(),
+                                            howmany_dims.data(),
+                                            reinterpret_cast<Tfloat*>(cpu_in),
+                                            reinterpret_cast<fftw_complex_type*>(cpu_out),
+                                            use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
+    case fft_transform_type_real_inverse:
+        return fftw_plan_guru64_c2r<Tfloat>(dims.size(),
+                                            dims.data(),
+                                            howmany_dims.size(),
+                                            howmany_dims.data(),
+                                            reinterpret_cast<fftw_complex_type*>(cpu_in),
+                                            reinterpret_cast<Tfloat*>(cpu_out),
+                                            use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
+    default:
+        throw std::runtime_error("Invalid transform type");
+    }
+}
+
+// construct an FFTW plan, given rocFFT parameters. output is
+// required if planning with wisdom.
+template <typename Tfloat>
+static typename fftw_trait<Tfloat>::fftw_plan_type
+    fftw_plan_via_rocfft(const std::vector<size_t>& length,
+                         const std::vector<size_t>& istride,
+                         const std::vector<size_t>& ostride,
+                         const size_t nbatch,
+                         const size_t idist,
+                         const size_t odist,
+                         const fft_transform_type transformType,
+                         std::vector<hostbuf>& input,
+                         std::vector<hostbuf>& output)
+{
+    // Dimension configuration:
+    std::vector<fftw_iodim64> dims(length.size());
+    for(unsigned int idx = 0; idx < length.size(); ++idx)
+    {
+        dims[idx].n  = length[idx];
+        dims[idx].is = istride[idx];
+        dims[idx].os = ostride[idx];
+    }
+
+    // Batch configuration:
+    std::vector<fftw_iodim64> howmany_dims(1);
+    howmany_dims[0].n  = nbatch;
+    howmany_dims[0].is = idist;
+    howmany_dims[0].os = odist;
+
+    return fftw_plan_with_precision<Tfloat>(dims,
+                                            howmany_dims,
+                                            transformType,
+                                            idist * nbatch,
+                                            input.front().data(),
+                                            output.empty() ? nullptr : output.front().data());
+}
+
+template <typename Tfloat>
+void fftw_run(fft_transform_type transformType,
+              typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan,
+              std::vector<hostbuf>& cpu_in,
+              std::vector<hostbuf>& cpu_out)
+{
+    switch(transformType)
+    {
+    case fft_transform_type_complex_forward:
+    {
+        fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
+        break;
+    }
+    case fft_transform_type_complex_inverse:
+    {
+        fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
+        break;
+    }
+    case fft_transform_type_real_forward:
+    {
+        fftw_plan_execute_r2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
+        break;
+    }
+    case fft_transform_type_real_inverse:
+    {
+        fftw_plan_execute_c2r<Tfloat>(cpu_plan, cpu_in, cpu_out);
+        break;
+    }
+    }
+}
+
+// Given a transform type, return the contiguous input type.
+inline fft_array_type contiguous_itype(const fft_transform_type transformType)
+{
+    switch(transformType)
+    {
+    case fft_transform_type_complex_forward:
+    case fft_transform_type_complex_inverse:
+        return fft_array_type_complex_interleaved;
+    case fft_transform_type_real_forward:
+        return fft_array_type_real;
+    case fft_transform_type_real_inverse:
+        return fft_array_type_hermitian_interleaved;
+    default:
+        throw std::runtime_error("Invalid transform type");
+    }
+    return fft_array_type_complex_interleaved;
+}
+
+// Given a transform type, return the contiguous output type.
+inline fft_array_type contiguous_otype(const fft_transform_type transformType)
+{
+    switch(transformType)
+    {
+    case fft_transform_type_complex_forward:
+    case fft_transform_type_complex_inverse:
+        return fft_array_type_complex_interleaved;
+    case fft_transform_type_real_forward:
+        return fft_array_type_hermitian_interleaved;
+    case fft_transform_type_real_inverse:
+        return fft_array_type_real;
+    default:
+        throw std::runtime_error("Invalid transform type");
+    }
+    return fft_array_type_complex_interleaved;
+}
+
+// Given a precision, return the acceptable tolerance.
+inline double type_epsilon(const fft_precision precision)
+{
+    switch(precision)
+    {
+    case fft_precision_half:
+        return type_epsilon<_Float16>();
+        break;
+    case fft_precision_single:
+        return type_epsilon<float>();
+        break;
+    case fft_precision_double:
+        return type_epsilon<double>();
+        break;
+    default:
+        throw std::runtime_error("Invalid precision");
+    }
+}
+
+#endif
diff --git a/shared/rocfft_complex.h b/shared/rocfft_complex.h
|
|
new file mode 100644
|
|
index 0000000..efa0290
|
|
--- /dev/null
|
|
+++ b/shared/rocfft_complex.h
|
|
@@ -0,0 +1,346 @@
|
|
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
+//
|
|
+// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
+// of this software and associated documentation files (the "Software"), to deal
|
|
+// in the Software without restriction, including without limitation the rights
|
|
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
+// copies of the Software, and to permit persons to whom the Software is
|
|
+// furnished to do so, subject to the following conditions:
|
|
+//
|
|
+// The above copyright notice and this permission notice shall be included in
|
|
+// all copies or substantial portions of the Software.
|
|
+//
|
|
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
+// THE SOFTWARE.
|
|
+
|
|
+#ifndef ROCFFT_COMPLEX_H
|
|
+#define ROCFFT_COMPLEX_H
|
|
+
|
|
+#include <hip/hip_fp16.h>
|
|
+#if !defined(__HIPCC_RTC__)
|
|
+#include <iostream>
|
|
+#endif
|
|
+#include <math.h>
|
|
+#include <type_traits>
|
|
+
|
|
+#ifdef __HIP_PLATFORM_NVIDIA__
|
|
+typedef __half _Float16;
|
|
+#endif
|
|
+
|
|
+template <typename Treal>
|
|
+struct rocfft_complex
|
|
+{
|
|
+
|
|
+ Treal x; // Real part
|
|
+ Treal y; // Imaginary part
|
|
+
|
|
+ // Constructors
|
|
+ // Do not initialize the members x or y by default, to ensure that it can
|
|
+ // be used in __shared__ and that it is a trivial class compatible with C.
|
|
+ __device__ __host__ rocfft_complex() = default;
|
|
+ __device__ __host__ rocfft_complex(const rocfft_complex&) = default;
|
|
+ __device__ __host__ rocfft_complex(rocfft_complex&&) = default;
|
|
+ __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default;
|
|
+ __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default;
|
|
+ __device__ __host__ ~rocfft_complex() = default;
|
|
+
|
|
+ // Constructor from real and imaginary parts
|
|
+ __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag)
|
|
+ : x{real}
|
|
+ , y{imag}
|
|
+ {
|
|
+ }
|
|
+
|
|
+ // Conversion from different precision
|
|
+ template <typename U>
|
|
+ __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex<U>& z)
|
|
+ : x(z.x)
|
|
+ , y(z.y)
|
|
+ {
|
|
+ }
|
|
+
|
|
+ // Accessors
|
|
+ __device__ __host__ constexpr Treal real() const
|
|
+ {
|
|
+ return x;
|
|
+ }
|
|
+
|
|
+ __device__ __host__ constexpr Treal imag() const
|
|
+ {
|
|
+ return y;
|
|
+ }
|
|
+
|
|
+ // Unary operations
|
|
+ __forceinline__ __device__ __host__ rocfft_complex operator-() const
|
|
+ {
|
|
+ return {-x, -y};
|
|
+ }
|
|
+
|
|
+ __forceinline__ __device__ __host__ rocfft_complex operator+() const
|
|
+ {
|
|
+ return *this;
|
|
+ }
|
|
+
|
|
+ __device__ __host__ Treal asum(const rocfft_complex& z)
|
|
+ {
|
|
+ return abs(z.x) + abs(z.y);
|
|
+ }
|
|
+
|
|
+ // Internal real functions
|
|
+ static __forceinline__ __device__ __host__ Treal abs(Treal x)
|
|
+ {
|
|
+ return x < 0 ? -x : x;
|
|
+ }
|
|
+
|
|
+ static __forceinline__ __device__ __host__ float sqrt(float x)
|
|
+ {
|
|
+ return ::sqrtf(x);
|
|
+ }
|
|
+
|
|
+ static __forceinline__ __device__ __host__ double sqrt(double x)
|
|
+ {
|
|
+ return ::sqrt(x);
|
|
+ }
|
|
+
|
|
+ // Addition operators
|
|
+ __device__ __host__ auto& operator+=(const rocfft_complex& rhs)
|
|
+ {
|
|
+ return *this = {x + rhs.x, y + rhs.y};
|
|
+ }
|
|
+
|
|
+ __device__ __host__ auto operator+(const rocfft_complex& rhs) const
|
|
+ {
|
|
+ auto lhs = *this;
|
|
+ return lhs += rhs;
|
|
+ }
|
|
+
|
|
+ // Subtraction operators
|
|
+ __device__ __host__ auto& operator-=(const rocfft_complex& rhs)
|
|
+ {
|
|
+ return *this = {x - rhs.x, y - rhs.y};
|
|
+ }
|
|
+
|
|
+ __device__ __host__ auto operator-(const rocfft_complex& rhs) const
|
|
+ {
|
|
+ auto lhs = *this;
|
|
+ return lhs -= rhs;
|
|
+ }
|
|
+
|
|
+ // Multiplication operators
|
|
+ __device__ __host__ auto& operator*=(const rocfft_complex& rhs)
|
|
+ {
|
|
+ return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y};
|
|
+ }
|
|
+
|
|
+ __device__ __host__ auto operator*(const rocfft_complex& rhs) const
|
|
+ {
|
|
+ auto lhs = *this;
|
|
+ return lhs *= rhs;
|
|
+ }
|
|
+
|
|
+ // Division operators
|
|
+ __device__ __host__ auto& operator/=(const rocfft_complex& rhs)
|
|
+ {
|
|
+ // Form of Robert L. Smith's Algorithm 116
|
|
+ if(abs(rhs.x) > abs(rhs.y))
|
|
+ {
|
|
+ Treal ratio = rhs.y / rhs.x;
|
|
+ Treal scale = 1 / (rhs.x + rhs.y * ratio);
|
|
+ *this = {(x + y * ratio) * scale, (y - x * ratio) * scale};
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ Treal ratio = rhs.x / rhs.y;
|
|
+ Treal scale = 1 / (rhs.x * ratio + rhs.y);
|
|
+ *this = {(y + x * ratio) * scale, (y * ratio - x) * scale};
|
|
+ }
|
|
+ return *this;
|
|
+ }
|
|
+
|
|
+ __device__ __host__ auto operator/(const rocfft_complex& rhs) const
|
|
+ {
|
|
+ auto lhs = *this;
|
|
+ return lhs /= rhs;
|
|
+ }
|
|
+
|
|
+    // Comparison operators
+    __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const
+    {
+        return x == rhs.x && y == rhs.y;
+    }
+
+    __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    // Operators for complex-real computations
+    template <typename U>
+    __device__ __host__ auto& operator+=(const U& rhs)
+    {
+        return (x += Treal(rhs)), *this;
+    }
+
+    template <typename U>
+    __device__ __host__ auto& operator-=(const U& rhs)
+    {
+        return (x -= Treal(rhs)), *this;
+    }
+
+    __device__ __host__ auto operator+(const Treal& rhs)
+    {
+        auto lhs = *this;
+        return lhs += rhs;
+    }
+
+    __device__ __host__ auto operator-(const Treal& rhs)
+    {
+        auto lhs = *this;
+        return lhs -= rhs;
+    }
+
+    template <typename U>
+    __device__ __host__ auto& operator*=(const U& rhs)
+    {
+        return (x *= Treal(rhs)), (y *= Treal(rhs)), *this;
+    }
+
+    template <typename U>
+    __device__ __host__ auto operator*(const U& rhs) const
+    {
+        auto lhs = *this;
+        return lhs *= Treal(rhs);
+    }
+
+    template <typename U>
+    __device__ __host__ auto& operator/=(const U& rhs)
+    {
+        return (x /= Treal(rhs)), (y /= Treal(rhs)), *this;
+    }
+
+    template <typename U>
+    __device__ __host__ auto operator/(const U& rhs) const
+    {
+        auto lhs = *this;
+        return lhs /= Treal(rhs);
+    }
+
+    template <typename U>
+    __device__ __host__ constexpr bool operator==(const U& rhs) const
+    {
+        return x == Treal(rhs) && y == 0;
+    }
+
+    template <typename U>
+    __device__ __host__ constexpr bool operator!=(const U& rhs) const
+    {
+        return !(*this == rhs);
+    }
+};
+
+// Stream operators
+#if !defined(__HIPCC_RTC__)
+static std::ostream& operator<<(std::ostream& stream, const _Float16& f)
+{
+    return stream << static_cast<double>(f);
+}
+
+template <typename Treal>
+std::ostream& operator<<(std::ostream& out, const rocfft_complex<Treal>& z)
+{
+    return out << '(' << static_cast<double>(z.x) << ',' << static_cast<double>(z.y) << ')';
+}
+#endif
+
+// Operators for real-complex computations
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator+(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return {Treal(lhs) + rhs.x, rhs.y};
+}
+
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator-(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return {Treal(lhs) - rhs.x, -rhs.y};
+}
+
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator*(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return {Treal(lhs) * rhs.x, Treal(lhs) * rhs.y};
+}
+
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator/(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    // Form of Robert L. Smith's Algorithm 116
+    if(rocfft_complex<Treal>::abs(rhs.x) > rocfft_complex<Treal>::abs(rhs.y))
+    {
+        Treal ratio = rhs.y / rhs.x;
+        Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio);
+        return {scale, -scale * ratio};
+    }
+    else
+    {
+        Treal ratio = rhs.x / rhs.y;
+        Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y);
+        return {ratio * scale, -scale};
+    }
+}
+
+template <typename U, typename Treal>
+__device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return Treal(lhs) == rhs.x && 0 == rhs.y;
+}
+
+template <typename U, typename Treal>
+__device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return !(lhs == rhs);
+}
+
+// Extending std namespace to handle rocfft_complex datatype
+namespace std
+{
+    template <typename Treal>
+    __device__ __host__ constexpr Treal real(const rocfft_complex<Treal>& z)
+    {
+        return z.x;
+    }
+
+    template <typename Treal>
+    __device__ __host__ constexpr Treal imag(const rocfft_complex<Treal>& z)
+    {
+        return z.y;
+    }
+
+    template <typename Treal>
+    __device__ __host__ constexpr rocfft_complex<Treal> conj(const rocfft_complex<Treal>& z)
+    {
+        return {z.x, -z.y};
+    }
+
+    template <typename Treal>
+    __device__ __host__ inline Treal norm(const rocfft_complex<Treal>& z)
+    {
+        return (z.x * z.x) + (z.y * z.y);
+    }
+
+    template <typename Treal>
+    __device__ __host__ inline Treal abs(const rocfft_complex<Treal>& z)
+    {
+        Treal tr = rocfft_complex<Treal>::abs(z.x), ti = rocfft_complex<Treal>::abs(z.y);
+        return tr > ti ? (ti /= tr, tr * rocfft_complex<Treal>::sqrt(ti * ti + 1))
+               : ti    ? (tr /= ti, ti * rocfft_complex<Treal>::sqrt(tr * tr + 1))
+                       : 0;
+    }
+}
+
+#endif // ROCFFT_COMPLEX_H
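Note (editorial, not part of the patch): a minimal host-side sketch exercising the header above. It is built with hipcc so the __device__/__host__ qualifiers resolve; the include path is illustrative and should match your tree:

#include "../shared/rocfft_complex.h"
#include <iostream>

int main()
{
    rocfft_complex<float> a{3.0f, 4.0f};
    rocfft_complex<float> b{1.0f, -2.0f};

    std::cout << a << " * " << b << " = " << a * b << "\n";
    std::cout << a << " / " << b << " = " << a / b << "\n"; // Smith-scaled division
    std::cout << "abs(a)  = " << std::abs(a) << "\n";       // overflow-safe magnitude: 5
    std::cout << "conj(b) = " << std::conj(b) << "\n";
    return 0;
}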
diff --git a/shared/rocfft_hip.h b/shared/rocfft_hip.h
new file mode 100644
index 0000000..e086cab
--- /dev/null
+++ b/shared/rocfft_hip.h
@@ -0,0 +1,52 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef __ROCFFT_HIP_H__
+#define __ROCFFT_HIP_H__
+
+#include <hip/hip_runtime_api.h>
+#include <stdexcept>
+
+class rocfft_scoped_device
+{
+public:
+    rocfft_scoped_device(int device)
+    {
+        if(hipGetDevice(&orig_device) != hipSuccess)
+            throw std::runtime_error("hipGetDevice failure");
+
+        if(hipSetDevice(device) != hipSuccess)
+            throw std::runtime_error("hipSetDevice failure");
+    }
+    ~rocfft_scoped_device()
+    {
+        (void)hipSetDevice(orig_device);
+    }
+
+    // not copyable or movable
+    rocfft_scoped_device(const rocfft_scoped_device&) = delete;
+    rocfft_scoped_device(rocfft_scoped_device&&) = delete;
+    rocfft_scoped_device& operator=(const rocfft_scoped_device&) = delete;
+
+private:
+    int orig_device;
+};
+
+#endif // __ROCFFT_HIP_H__
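Note (editorial, not part of the patch): a sketch of the RAII pattern this class enables; the current HIP device is switched for one scope and restored on exit, even when the body throws. The device id and the hipMemset use are illustrative:

#include "../shared/rocfft_hip.h"
#include <cstddef>
#include <stdexcept>

// zero a buffer that lives on another device, then return to the caller's device
void zero_on_device(int device, void* dst, size_t bytes)
{
    rocfft_scoped_device scope(device); // switches the current device
    if(hipMemset(dst, 0, bytes) != hipSuccess)
        throw std::runtime_error("hipMemset failure");
} // destructor restores the original device here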
diff --git a/shared/rocfft_params.h b/shared/rocfft_params.h
new file mode 100644
index 0000000..bf9b728
--- /dev/null
+++ b/shared/rocfft_params.h
@@ -0,0 +1,585 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_PARAMS_H
+#define ROCFFT_PARAMS_H
+
+#include "../shared/fft_params.h"
+#include "../shared/gpubuf.h"
+#include "rocfft/rocfft.h"
+
+// Return the string of the rocfft_status code
+static std::string rocfft_status_to_string(const rocfft_status ret)
+{
+    switch(ret)
+    {
+    case rocfft_status_success:
+        return "rocfft_status_success";
+    case rocfft_status_failure:
+        return "rocfft_status_failure";
+    case rocfft_status_invalid_arg_value:
+        return "rocfft_status_invalid_arg_value";
+    case rocfft_status_invalid_dimensions:
+        return "rocfft_status_invalid_dimensions";
+    case rocfft_status_invalid_array_type:
+        return "rocfft_status_invalid_array_type";
+    case rocfft_status_invalid_strides:
+        return "rocfft_status_invalid_strides";
+    case rocfft_status_invalid_distance:
+        return "rocfft_status_invalid_distance";
+    case rocfft_status_invalid_offset:
+        return "rocfft_status_invalid_offset";
+    case rocfft_status_invalid_work_buffer:
+        return "rocfft_status_invalid_work_buffer";
+    default:
+        throw std::runtime_error("unknown rocfft_status");
+    }
+}
+
+inline fft_status fft_status_from_rocfftparams(const rocfft_status val)
+{
+    switch(val)
+    {
+    case rocfft_status_success:
+        return fft_status_success;
+    case rocfft_status_failure:
+        return fft_status_failure;
+    case rocfft_status_invalid_arg_value:
+        return fft_status_invalid_arg_value;
+    case rocfft_status_invalid_dimensions:
+        return fft_status_invalid_dimensions;
+    case rocfft_status_invalid_array_type:
+        return fft_status_invalid_array_type;
+    case rocfft_status_invalid_strides:
+        return fft_status_invalid_strides;
+    case rocfft_status_invalid_distance:
+        return fft_status_invalid_distance;
+    case rocfft_status_invalid_offset:
+        return fft_status_invalid_offset;
+    case rocfft_status_invalid_work_buffer:
+        return fft_status_invalid_work_buffer;
+    default:
+        throw std::runtime_error("Invalid status");
+    }
+}
+
+inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val)
+{
+    switch(val)
+    {
+    case fft_precision_single:
+        return rocfft_precision_single;
+    case fft_precision_double:
+        return rocfft_precision_double;
+    case fft_precision_half:
+        return rocfft_precision_half;
+    default:
+        throw std::runtime_error("Invalid precision");
+    }
+}
+
+inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val)
+{
+    switch(val)
+    {
+    case fft_array_type_complex_interleaved:
+        return rocfft_array_type_complex_interleaved;
+    case fft_array_type_complex_planar:
+        return rocfft_array_type_complex_planar;
+    case fft_array_type_real:
+        return rocfft_array_type_real;
+    case fft_array_type_hermitian_interleaved:
+        return rocfft_array_type_hermitian_interleaved;
+    case fft_array_type_hermitian_planar:
+        return rocfft_array_type_hermitian_planar;
+    case fft_array_type_unset:
+        return rocfft_array_type_unset;
+    }
+    return rocfft_array_type_unset;
+}
+
+inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val)
+{
+    switch(val)
+    {
+    case fft_transform_type_complex_forward:
+        return rocfft_transform_type_complex_forward;
+    case fft_transform_type_complex_inverse:
+        return rocfft_transform_type_complex_inverse;
+    case fft_transform_type_real_forward:
+        return rocfft_transform_type_real_forward;
+    case fft_transform_type_real_inverse:
+        return rocfft_transform_type_real_inverse;
+    default:
+        throw std::runtime_error("Invalid transform type");
+    }
+}
+
+inline rocfft_result_placement
+    rocfft_result_placement_from_fftparams(const fft_result_placement val)
+{
+    switch(val)
+    {
+    case fft_placement_inplace:
+        return rocfft_placement_inplace;
+    case fft_placement_notinplace:
+        return rocfft_placement_notinplace;
+    default:
+        throw std::runtime_error("Invalid result placement");
+    }
+}
+
+class rocfft_params : public fft_params
+{
+public:
+    rocfft_plan plan = nullptr;
+    rocfft_execution_info info = nullptr;
+    rocfft_plan_description desc = nullptr;
+    gpubuf_t<void> wbuffer;
+
+    explicit rocfft_params(){};
+
+    explicit rocfft_params(const fft_params& p)
+        : fft_params(p){};
+
+    rocfft_params(const rocfft_params&) = delete;
+    rocfft_params& operator=(const rocfft_params&) = delete;
+
+    ~rocfft_params()
+    {
+        free();
+    };
+
+    void free()
+    {
+        if(plan != nullptr)
+        {
+            rocfft_plan_destroy(plan);
+            plan = nullptr;
+        }
+        if(info != nullptr)
+        {
+            rocfft_execution_info_destroy(info);
+            info = nullptr;
+        }
+        if(desc != nullptr)
+        {
+            rocfft_plan_description_destroy(desc);
+            desc = nullptr;
+        }
+        wbuffer.free();
+    }
+
+    void validate_fields() const override
+    {
+        // row-major lengths including batch (i.e. batch is at the front)
+        std::vector<size_t> length_with_batch{nbatch};
+        std::copy(length.begin(), length.end(), std::back_inserter(length_with_batch));
+
+        auto validate_field = [&](const fft_field& f) {
+            for(const auto& b : f.bricks)
+            {
+                // bricks must have same dim as FFT, including batch
+                if(b.lower.size() != length.size() + 1 || b.upper.size() != length.size() + 1
+                   || b.stride.size() != length.size() + 1)
+                    throw std::runtime_error(
+                        "brick dimension does not match FFT + batch dimension");
+
+                // ensure lower < upper, and that both fit in the FFT + batch dims
+                if(!std::lexicographical_compare(
+                       b.lower.begin(), b.lower.end(), b.upper.begin(), b.upper.end()))
+                    throw std::runtime_error("brick lower index is not less than upper index");
+
+                if(!std::lexicographical_compare(b.lower.begin(),
+                                                 b.lower.end(),
+                                                 length_with_batch.begin(),
+                                                 length_with_batch.end()))
+                    throw std::runtime_error(
+                        "brick lower index is not less than FFT + batch length");
+
+                if(!std::lexicographical_compare(b.upper.begin(),
+                                                 b.upper.end(),
+                                                 length_with_batch.begin(),
+                                                 length_with_batch.end())
+                   && b.upper != length_with_batch)
+                    throw std::runtime_error("brick upper index is not <= FFT + batch length");
+            }
+        };
+
+        for(const auto& ifield : ifields)
+            validate_field(ifield);
+        for(const auto& ofield : ofields)
+            validate_field(ofield);
+    }
+
+    rocfft_precision get_rocfft_precision()
+    {
+        return rocfft_precision_from_fftparams(precision);
+    }
+
+    size_t vram_footprint() override
+    {
+        size_t val = fft_params::vram_footprint();
+        if(setup_structs() != fft_status_success)
+        {
+            throw std::runtime_error("Struct setup failed");
+        }
+        val += workbuffersize;
+
+        return val;
+    }
+
+    // Convert the generic fft_field structure to a rocfft_field
+    // structure that can be passed to rocFFT. In particular, we need
+    // to convert from row-major to column-major.
+    static rocfft_field fft_field_to_rocfft_field(const fft_field& f)
+    {
+        rocfft_field rfield = nullptr;
+        if(f.bricks.empty())
+            return rfield;
+
+        if(rocfft_field_create(&rfield) != rocfft_status_success)
+            throw std::runtime_error("rocfft_field_create failed");
+        for(const auto& b : f.bricks)
+        {
+            // rocFFT wants column-major bricks and fft_params stores
+            // row-major
+            std::vector<size_t> lower_cm;
+            std::copy(b.lower.rbegin(), b.lower.rend(), std::back_inserter(lower_cm));
+            std::vector<size_t> upper_cm;
+            std::copy(b.upper.rbegin(), b.upper.rend(), std::back_inserter(upper_cm));
+            std::vector<size_t> stride_cm;
+            std::copy(b.stride.rbegin(), b.stride.rend(), std::back_inserter(stride_cm));
+
+            rocfft_brick rbrick = nullptr;
+            if(rocfft_brick_create(&rbrick,
+                                   lower_cm.data(), // field_lower
+                                   upper_cm.data(), // field_upper
+                                   stride_cm.data(), // brick_stride
+                                   lower_cm.size(), // dim
+                                   b.device) // deviceID
+               != rocfft_status_success)
+                throw std::runtime_error("rocfft_brick_create failed");
+
+            if(rocfft_field_add_brick(rfield, rbrick) != rocfft_status_success)
+                throw std::runtime_error("rocfft_field_add_brick failed");
+
+            rocfft_brick_destroy(rbrick);
+        }
+        return rfield;
+    }
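Note (editorial, not part of the patch): fft_params stores brick coordinates row-major with batch first, while rocfft_brick_create expects column-major, hence the reversed copies above. As a hypothetical illustration, one brick holding both batches of a contiguous 8x4 FFT with nbatch = 2:

    row-major:    lower = {0, 0, 0}  upper = {2, 8, 4}  stride = {32, 4, 1}
    column-major: lower = {0, 0, 0}  upper = {4, 8, 2}  stride = {1, 4, 32}

The column-major vectors are what rocfft_brick_create receives.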
+
+    fft_status setup_structs()
+    {
+        rocfft_status fft_status = rocfft_status_success;
+        if(desc == nullptr)
+        {
+            rocfft_plan_description_create(&desc);
+            if(fft_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(fft_status);
+
+            fft_status
+                = rocfft_plan_description_set_data_layout(desc,
+                                                          rocfft_array_type_from_fftparams(itype),
+                                                          rocfft_array_type_from_fftparams(otype),
+                                                          ioffset.data(),
+                                                          ooffset.data(),
+                                                          istride_cm().size(),
+                                                          istride_cm().data(),
+                                                          idist,
+                                                          ostride_cm().size(),
+                                                          ostride_cm().data(),
+                                                          odist);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_plan_description_set_data_layout failed");
+            }
+
+            if(scale_factor != 1.0)
+            {
+                fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor);
+                if(fft_status != rocfft_status_success)
+                {
+                    throw std::runtime_error("rocfft_plan_description_set_scale_factor failed");
+                }
+            }
+
+            for(const auto& ifield : ifields)
+            {
+                rocfft_field infield = fft_field_to_rocfft_field(ifield);
+                if(rocfft_plan_description_add_infield(desc, infield) != rocfft_status_success)
+                    throw std::runtime_error("rocfft_description_add_infield failed");
+                rocfft_field_destroy(infield);
+            }
+
+            for(const auto& ofield : ofields)
+            {
+                rocfft_field outfield = fft_field_to_rocfft_field(ofield);
+                if(rocfft_plan_description_add_outfield(desc, outfield) != rocfft_status_success)
+                    throw std::runtime_error("rocfft_description_add_outfield failed");
+                rocfft_field_destroy(outfield);
+            }
+        }
+
+        if(plan == nullptr)
+        {
+            fft_status = rocfft_plan_create(&plan,
+                                            rocfft_result_placement_from_fftparams(placement),
+                                            rocfft_transform_type_from_fftparams(transform_type),
+                                            get_rocfft_precision(),
+                                            length_cm().size(),
+                                            length_cm().data(),
+                                            nbatch,
+                                            desc);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_plan_create failed");
+            }
+        }
+
+        if(info == nullptr)
+        {
+            fft_status = rocfft_execution_info_create(&info);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_execution_info_create failed");
+            }
+        }
+
+        fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize);
+        if(fft_status != rocfft_status_success)
+        {
+            throw std::runtime_error("rocfft_plan_get_work_buffer_size failed");
+        }
+
+        return fft_status_from_rocfftparams(fft_status);
+    }
+
+    fft_status create_plan() override
+    {
+        fft_status ret = setup_structs();
+        if(ret != fft_status_success)
+        {
+            return ret;
+        }
+        if(workbuffersize > 0)
+        {
+            hipError_t hip_status = hipSuccess;
+            hip_status = wbuffer.alloc(workbuffersize);
+            if(hip_status != hipSuccess)
+            {
+                std::ostringstream oss;
+                oss << "work buffer allocation failed (" << workbuffersize << " requested)";
+                size_t mem_free = 0;
+                size_t mem_total = 0;
+                hip_status = hipMemGetInfo(&mem_free, &mem_total);
+                if(hip_status == hipSuccess)
+                {
+                    oss << "free vram: " << mem_free << " total vram: " << mem_total;
+                }
+                else
+                {
+                    oss << "hipMemGetInfo also failed";
+                }
+                throw work_buffer_alloc_failure(oss.str());
+            }
+
+            auto rocret
+                = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize);
+            if(rocret != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_execution_info_set_work_buffer failed");
+            }
+        }
+
+        return ret;
+    }
+
+    fft_status set_callbacks(void* load_cb_host,
+                             void* load_cb_data,
+                             void* store_cb_host,
+                             void* store_cb_data) override
+    {
+        if(run_callbacks)
+        {
+            auto roc_status
+                = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0);
+            if(roc_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(roc_status);
+
+            roc_status
+                = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0);
+            if(roc_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(roc_status);
+        }
+        return fft_status_success;
+    }
+
+    fft_status execute(void** in, void** out) override
+    {
+        auto ret = rocfft_execute(plan, in, out, info);
+        return fft_status_from_rocfftparams(ret);
+    }
+
+    // scatter data to multiple GPUs and adjust I/O buffers to match
+    void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
+                           std::vector<void*>& pibuffer,
+                           std::vector<void*>& pobuffer) override
+    {
+        auto alloc_fields = [&](const fft_params::fft_field& field,
+                                fft_array_type array_type,
+                                std::vector<void*>& pbuffer,
+                                bool copy_input) {
+            if(field.bricks.empty())
+                return;
+
+            // we have a field defined, clear the list of buffers as
+            // we'll be allocating new ones for each brick
+            pbuffer.clear();
+
+            for(const auto& b : field.bricks)
+            {
+                // get brick's length - note that this includes batch
+                // dimension
+                const auto brick_len = b.length();
+                const auto brick_stride = b.stride;
+
+                const size_t brick_size_elems = product(brick_len.begin(), brick_len.end());
+                const size_t elem_size_bytes = var_size<size_t>(precision, array_type);
+                const size_t brick_size_bytes = brick_size_elems * elem_size_bytes;
+
+                // set device for the alloc, but we want to return to the
+                // default device as the source of a following memcpy
+                {
+                    rocfft_scoped_device dev(b.device);
+                    multi_gpu_data.emplace_back();
+                    if(multi_gpu_data.back().alloc(brick_size_bytes) != hipSuccess)
+                        throw std::runtime_error("device allocation failure");
+                    pbuffer.push_back(multi_gpu_data.back().data());
+                }
+
+                if(copy_input)
+                {
+                    // For now, assume we're only splitting on highest FFT
+                    // dimension, lower-dimensional FFT data is all
+                    // contiguous, and batches are contiguous in each brick.
+                    //
+                    // That means we can express this as a 2D memcpy.
+                    const size_t unbatched_elems_per_brick
+                        = product(brick_len.begin() + 1, brick_len.end());
+                    const size_t unbatched_elems_per_fft = product(length.begin(), length.end());
+
+                    // get this brick's starting offset in the field
+                    const size_t brick_offset
+                        = b.lower_field_offset(istride, idist) * elem_size_bytes;
+
+                    // copy from original input - note that we're
+                    // assuming interleaved data so ibuffer has only one
+                    // gpubuf
+                    if(hipMemcpy2D(pbuffer.back(),
+                                   unbatched_elems_per_brick * elem_size_bytes,
+                                   ibuffer.front().data_offset(brick_offset),
+                                   unbatched_elems_per_fft * elem_size_bytes,
+                                   unbatched_elems_per_brick * elem_size_bytes,
+                                   brick_len.front(),
+                                   hipMemcpyHostToDevice)
+                       != hipSuccess)
+                        throw std::runtime_error("hipMemcpy failure");
+                }
+            }
+
+            // if we copied the input to all the other devices, and
+            // this is an out-of-place transform, we no longer
+            // need the original input
+            if(copy_input && placement == fft_placement_notinplace)
+                ibuffer.clear();
+        };
+
+        // assume one input, one output field for simple cases
+        if(!ifields.empty())
+            alloc_fields(ifields.front(), itype, pibuffer, true);
+        if(!ofields.empty())
+        {
+            if(!ifields.empty() && placement == fft_placement_inplace)
+                pobuffer = pibuffer;
+            else
+                alloc_fields(ofields.front(), otype, pobuffer, false);
+        }
+    }
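Note (editorial, not part of the patch): the hipMemcpy2D in alloc_fields treats each batch as one "row" of the copy. With the names used above, the parameters map as:

    hipMemcpy2D(dst,                                          // brick allocation
                unbatched_elems_per_brick * elem_size_bytes,  // dst pitch: one batch within the brick
                src,                                          // input + brick_offset
                unbatched_elems_per_fft * elem_size_bytes,    // src pitch: one full FFT
                unbatched_elems_per_brick * elem_size_bytes,  // width: bytes copied per batch
                brick_len.front(),                            // height: number of batches in the brick
                hipMemcpyHostToDevice);

This is valid only under the stated assumption that the field is split solely on the slowest dimension, so each batch's data within a brick is contiguous.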
+
+    // when preparing for multi-GPU transform, we need to allocate data
+    // on each GPU. This vector remembers all of those allocations.
+    std::vector<gpubuf> multi_gpu_data;
+
+    // gather data after multi-GPU FFT for verification
+    void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) override
+    {
+        if(ofields.empty())
+            return;
+
+        for(size_t i = 0; i < ofields.front().bricks.size(); ++i)
+        {
+            const auto& b = ofields.front().bricks[i];
+            const auto& brick_ptr = pobuffer[i];
+
+            const auto brick_len = b.length();
+
+            const size_t elem_size_bytes = var_size<size_t>(precision, otype);
+
+            // get this brick's starting offset in the field
+            const size_t brick_offset = b.lower_field_offset(ostride, odist) * elem_size_bytes;
+
+            // switch device to where we're copying from
+            rocfft_scoped_device dev(b.device);
+
+            // For now, assume we're only splitting on highest FFT
+            // dimension, lower-dimensional FFT data is all
+            // contiguous, and batches are contiguous in each brick.
+            //
+            // That means we can express this as a 2D memcpy.
+            const size_t unbatched_elems_per_brick
+                = product(brick_len.begin() + 1, brick_len.end());
+            const auto output_length = olength();
+            const size_t unbatched_elems_per_fft
+                = product(output_length.begin(), output_length.end());
+
+            // copy to original output buffer - note that
+            // we're assuming interleaved data so obuffer
+            // has only one gpubuf
+            if(hipMemcpy2D(obuffer.front().data_offset(brick_offset),
+                           unbatched_elems_per_fft * elem_size_bytes,
+                           brick_ptr,
+                           unbatched_elems_per_brick * elem_size_bytes,
+                           unbatched_elems_per_brick * elem_size_bytes,
+                           brick_len.front(),
+                           hipMemcpyDeviceToDevice)
+               != hipSuccess)
+                throw std::runtime_error("hipMemcpy failure");
+
+            // device-to-device transfers don't synchronize with the
+            // host, add explicit sync
+            (void)hipDeviceSynchronize();
+        }
+        pobuffer.clear();
+        pobuffer.push_back(obuffer.front().data());
+    }
+};
+
+#endif
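Note (editorial, not part of the patch): a minimal sketch of driving this wrapper, assuming the fft_params members referenced above (length, nbatch, precision, transform_type, placement) are publicly settable and that unset strides and array types take usable defaults; run_c2c_64 and its buffer arguments are hypothetical:

#include "../shared/rocfft_params.h"

// hypothetical helper: plan and run a 64-point single-precision forward C2C FFT
fft_status run_c2c_64(void* in, void* out)
{
    rocfft_params params;
    params.length         = {64};
    params.nbatch         = 1;
    params.precision      = fft_precision_single;
    params.transform_type = fft_transform_type_complex_forward;
    params.placement      = fft_placement_notinplace;

    // creates desc/plan/info and allocates the work buffer if one is needed
    fft_status st = params.create_plan();
    if(st != fft_status_success)
        return st;

    void* ibuf[] = {in};
    void* obuf[] = {out};
    return params.execute(ibuf, obuf); // ~rocfft_params() frees plan, info, desc
}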
diff --git a/shared/test_params.h b/shared/test_params.h
new file mode 100644
index 0000000..8d8f6f7
--- /dev/null
+++ b/shared/test_params.h
@@ -0,0 +1,51 @@
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+#ifndef TESTCONSTANTS_H
+#define TESTCONSTANTS_H
+
+#include <stdexcept>
+
+extern int verbose;
+extern size_t ramgb;
+extern size_t vramgb;
+
+extern size_t n_random_tests;
+
+extern size_t random_seed;
+extern double planar_prob;
+extern double callback_prob;
+
+extern double half_epsilon;
+extern double single_epsilon;
+extern double double_epsilon;
+extern bool skip_runtime_fails;
+
+extern double max_linf_eps_double;
+extern double max_l2_eps_double;
+extern double max_linf_eps_single;
+extern double max_l2_eps_single;
+extern double max_linf_eps_half;
+extern double max_l2_eps_half;
+
+extern int n_hip_failures;
+
+#endif
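Note (editorial, not part of the patch): these externs must be defined exactly once by the test driver (gtest_main.cpp in this patch's file list) for the test binary to link. A sketch of those one-time definitions, with placeholder values where the real driver parses them from command-line flags:

#include "../shared/test_params.h"

int    verbose            = 0;
size_t ramgb              = 0;
size_t vramgb             = 0;
size_t n_random_tests     = 0;
size_t random_seed        = 0;
double planar_prob        = 0.0;
double callback_prob      = 0.0;
double half_epsilon       = 0.0;
double single_epsilon     = 0.0;
double double_epsilon     = 0.0;
bool   skip_runtime_fails = false;
int    n_hip_failures     = 0;
// the max_* tolerance globals are defined the same way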
diff --git a/shared/work_queue.h b/shared/work_queue.h
new file mode 100644
index 0000000..e13fc41
--- /dev/null
+++ b/shared/work_queue.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+template <typename _WorkItem>
+struct WorkQueue
+{
+    void push(_WorkItem&& i)
+    {
+        std::unique_lock<std::mutex> lock(queueMutex);
+        items.emplace(std::move(i));
+        emptyWait.notify_all();
+    }
+    _WorkItem pop()
+    {
+        std::unique_lock<std::mutex> lock(queueMutex);
+        while(items.empty())
+            emptyWait.wait(lock);
+        _WorkItem item(items.front());
+        items.pop();
+        return item;
+    }
+
+private:
+    std::queue<_WorkItem> items;
+    std::mutex queueMutex;
+    std::condition_variable emptyWait;
+};
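Note (editorial, not part of the patch): a minimal producer/consumer sketch using only the interface above plus the standard library; the sentinel value 0 is a hypothetical shutdown convention:

#include "../shared/work_queue.h"
#include <iostream>
#include <thread>

int main()
{
    WorkQueue<int> queue;

    // consumer: pop() blocks on the condition variable until work arrives
    std::thread consumer([&queue] {
        for(;;)
        {
            int item = queue.pop();
            if(item == 0) // sentinel: stop consuming
                break;
            std::cout << "got " << item << "\n";
        }
    });

    for(int i = 1; i <= 4; ++i)
        queue.push(int(i)); // push takes an rvalue
    queue.push(0);          // signal shutdown

    consumer.join();
    return 0;
}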