initial upload
cuda/cuda_constants.cuh  (new file, +35 lines)
@@ -0,0 +1,35 @@
#ifndef CUDA_CONSTANTS_H
#define CUDA_CONSTANTS_H

#include <cuda_runtime.h>
#include <cuda.h>

#define CUSTOMREAL double // needed here for CUDA kernels
#define MPI_CR MPI_DOUBLE
//#define CUSTOMREAL float // needed here for CUDA kernels
//#define MPI_CR MPI_FLOAT

#define MPI_DUMMY_TAG_CUDA 9999

// maximum grid dimension in one direction of the GPU
//#define MAXIMUM_GRID_DIM 65535

#define CUDA_MAX_BLOCK_SIZE 1024
#define CUDA_MAX_GRID_SIZE 65535
#define CUDA_MAX_THREADS_PER_BLOCK 1024

// sweeping block size; commented values carry measured runtimes in seconds
//#define CUDA_SWEEPING_BLOCK_SIZE 16
//#define CUDA_SWEEPING_BLOCK_SIZE 32   // 15.254 s
//#define CUDA_SWEEPING_BLOCK_SIZE 64   // 15.281 s
//#define CUDA_SWEEPING_BLOCK_SIZE 128  // 15.378 s
//#define CUDA_SWEEPING_BLOCK_SIZE 256
#define CUDA_SWEEPING_BLOCK_SIZE 512
//#define CUDA_SWEEPING_BLOCK_SIZE 1024

#define CUDA_L1_BLOCK_SIZE 128
//#define CUDA_L1_BLOCK_SIZE 256

#define CUDA_MAX_NUM_STREAMS 32

#endif // CUDA_CONSTANTS_H
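For orientation, a minimal sketch (not part of this commit) of how a sweep-kernel launch could be configured from these constants; n_nodes and kernel_sweep are hypothetical names, and get_block_xy() is the grid-splitting helper defined in cuda/cuda_utils.cuh below:

    // Hypothetical launch configuration for one sweep level.
    int n_nodes = 1000000;                              // nodes on this level (assumed)
    int threads = CUDA_SWEEPING_BLOCK_SIZE;             // 512, the tuned value above
    int num_blocks = (n_nodes + threads - 1) / threads; // ceiling division
    int nbx, nby;
    get_block_xy(num_blocks, &nbx, &nby);               // keep each grid dim <= CUDA_MAX_GRID_SIZE
    kernel_sweep<<<dim3(nbx, nby), threads>>>(/* ... */);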
cuda/cuda_initialize.cuh  (new file, +145 lines)
@@ -0,0 +1,145 @@
#ifndef CUDA_INITIALIZE_H
#define CUDA_INITIALIZE_H

#include <stdio.h>
#include <stdlib.h>
#include <iostream>

#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

//#include "config.h"
#include "cuda_constants.cuh"

// world_rank and myrank are MPI rank variables expected to be defined elsewhere
// in the project (presumably via the commented-out config.h).

void get_free_memory(double* free_db, double* used_db, double* total_db) {

    // gets memory usage in bytes
    size_t free_byte;
    size_t total_byte;
    cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte);
    if (cudaSuccess != cuda_status){
        printf("Error: cudaMemGetInfo failed, %s\n", cudaGetErrorString(cuda_status));
        exit(EXIT_FAILURE);
    }

    *free_db  = (double)free_byte;
    *total_db = (double)total_byte;
    *used_db  = *total_db - *free_db;
    return;
}


// set up CUDA constants and variables by reading device properties
void initialize_cuda(){

    std::cout << "Initializing CUDA..." << std::endl;

    int ncuda_device;
    int device;

    // count the number of devices
    cudaGetDeviceCount(&ncuda_device);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("cudaGetDeviceCount returned error code %d; %d devices found\n", err, ncuda_device);
        exit(1);
    }

    if (ncuda_device == 0)
    {
        printf("There is no device supporting CUDA\n");
        exit(1);
    }

    // set the active device (round-robin over MPI ranks)
    if (ncuda_device >= 1){
        cudaDeviceReset();

        device = world_rank % ncuda_device;
        cudaSetDevice(device);

        // establish the CUDA context
        cudaFree(0);

        // check that the device is set
        cudaGetDevice(&device);
        if (device != world_rank % ncuda_device){
            printf("Error: Could not set device to %d\n", world_rank % ncuda_device);
            exit(1);
        }
    } // end if ncuda_device >= 1

    cudaGetDevice(&device);

    // get device properties
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, device);

    // exit if the machine has no CUDA-enabled device
    if (deviceProp.major == 9999 && deviceProp.minor == 9999){
        printf("Error: No CUDA device found\n");
        exit(1);
    }

    // print device properties
    char filename[256];

    if (world_rank == 0){
        sprintf(filename, "cuda_device_info.txt");
        FILE *fp = fopen(filename, "w");

        if (fp == NULL){
            printf("Error: Could not open file %s\n", filename);
            exit(1);
        }

        // display device properties
        fprintf(fp,"Device Name = %s\n",deviceProp.name);
        fprintf(fp,"memory:\n");
        fprintf(fp,"  totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
        fprintf(fp,"  totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
        fprintf(fp,"  totalConstMem (in bytes): %lu\n",(unsigned long) deviceProp.totalConstMem);
        fprintf(fp,"  Maximum 1D texture size (in elements): %lu\n",(unsigned long) deviceProp.maxTexture1D);
        fprintf(fp,"  sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
        fprintf(fp,"  regsPerBlock (count): %lu\n",(unsigned long) deviceProp.regsPerBlock);
        fprintf(fp,"blocks:\n");
        fprintf(fp,"  Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
        fprintf(fp,"  Maximum size of each dimension of a block: %d x %d x %d\n",
                deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
        fprintf(fp,"  Maximum size of each dimension of a grid: %d x %d x %d\n",
                deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
        fprintf(fp,"features:\n");
        fprintf(fp,"  Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
        fprintf(fp,"  multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
        if (deviceProp.canMapHostMemory){
            fprintf(fp,"  canMapHostMemory: TRUE\n");
        }else{
            fprintf(fp,"  canMapHostMemory: FALSE\n");
        }
        if (deviceProp.deviceOverlap){
            fprintf(fp,"  deviceOverlap: TRUE\n");
        }else{
            fprintf(fp,"  deviceOverlap: FALSE\n");
        }
        if (deviceProp.concurrentKernels){
            fprintf(fp,"  concurrentKernels: TRUE\n");
        }else{
            fprintf(fp,"  concurrentKernels: FALSE\n");
        }

        // output initial memory info via cudaMemGetInfo()
        double free_db,used_db,total_db;
        get_free_memory(&free_db,&used_db,&total_db);
        fprintf(fp,"memory usage:\n");
        fprintf(fp,"  rank %d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank,
                used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);

        // close the output file
        fclose(fp);
    }

}


void finalize_cuda(){
    cudaDeviceReset();
}

#endif // CUDA_INITIALIZE_H
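A minimal usage sketch (not part of this commit), assuming world_rank is the global int this header expects (e.g. from config.h) and is filled from MPI_Comm_rank:

    #include <mpi.h>
    #include "cuda_initialize.cuh"

    int main(int argc, char** argv) {
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); // assumed global int
        initialize_cuda();  // binds this rank to GPU world_rank % n_devices;
                            // rank 0 writes cuda_device_info.txt
        // ... solver work ...
        finalize_cuda();    // cudaDeviceReset()
        MPI_Finalize();
        return 0;
    }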
cuda/cuda_utils.cu  (new file, +186 lines)
@@ -0,0 +1,186 @@
#include "cuda_utils.cuh"


// allocate memory on device
cudaError_t allocate_memory_on_device_i(void** d_ptr, size_t size)
{
    return cudaMalloc((void**) d_ptr, size * sizeof(int));
}

cudaError_t allocate_memory_on_device_cv(void** d_ptr, size_t size)
{
    return cudaMalloc((void**) d_ptr, size * sizeof(CUSTOMREAL));
}

cudaError_t allocate_memory_on_device_bl(void** d_ptr, size_t size)
{
    return cudaMalloc((void**) d_ptr, size * sizeof(bool));
}


// pinned (page-locked) host memory shared between device and host
// (maybe unnecessary for CUDA-aware MPI)
cudaError_t allocate_memory_on_device_cv_pinned(void** d_ptr, size_t size)
{
    return cudaMallocHost((void**) d_ptr, size * sizeof(CUSTOMREAL));
}


// deallocate memory on device
cudaError_t deallocate_memory_on_device_i(int*& d_ptr)
{
    return cudaFree(d_ptr);
}

cudaError_t deallocate_memory_on_device_cv(CUSTOMREAL*& d_ptr)
{
    return cudaFree(d_ptr);
}

cudaError_t deallocate_memory_on_device_bl(bool*& d_ptr)
{
    return cudaFree(d_ptr);
}


// copy memory from host to device
cudaError_t copy_host_to_device_i(int* d_ptr, int* h_ptr, const size_t size)
{
    return cudaMemcpy(d_ptr, h_ptr, size * sizeof(int), cudaMemcpyHostToDevice);
}

cudaError_t copy_host_to_device_cv(CUSTOMREAL* d_ptr, CUSTOMREAL* h_ptr, const size_t size)
{
    return cudaMemcpy(d_ptr, h_ptr, size * sizeof(CUSTOMREAL), cudaMemcpyHostToDevice);
}

cudaError_t copy_host_to_device_bl(bool* d_ptr, bool* h_ptr, const size_t size)
{
    return cudaMemcpy(d_ptr, h_ptr, size * sizeof(bool), cudaMemcpyHostToDevice);
}

// copy memory from device to host
cudaError_t copy_device_to_host_i(int* h_ptr, int* d_ptr, size_t size)
{
    return cudaMemcpy(h_ptr, d_ptr, size * sizeof(int), cudaMemcpyDeviceToHost);
}

cudaError_t copy_device_to_host_cv(CUSTOMREAL* h_ptr, CUSTOMREAL* d_ptr, size_t size)
{
    return cudaMemcpy(h_ptr, d_ptr, size * sizeof(CUSTOMREAL), cudaMemcpyDeviceToHost);
}


// allocate and copy to device (num is the error tag passed to print_CUDA_error_if_any)
void* allocate_and_copy_host_to_device_i(int* h_ptr, size_t size, int num)
{
    void* d_ptr;

    print_CUDA_error_if_any(allocate_memory_on_device_i(&d_ptr, size), num);
    print_CUDA_error_if_any(copy_host_to_device_i((int*)d_ptr, h_ptr, size), num);

    return d_ptr;
}

void* allocate_and_copy_host_to_device_cv(CUSTOMREAL* h_ptr, size_t size, int num)
{
    void* d_ptr;
    print_CUDA_error_if_any(allocate_memory_on_device_cv(&d_ptr, size), num);
    print_CUDA_error_if_any(copy_host_to_device_cv((CUSTOMREAL*)d_ptr, h_ptr, size), num);

    return d_ptr;
}

void* allocate_and_copy_host_to_device_bl(bool* h_ptr, size_t size, int num)
{
    void* d_ptr;
    print_CUDA_error_if_any(allocate_memory_on_device_bl(&d_ptr, size), num);
    print_CUDA_error_if_any(copy_host_to_device_bl((bool*)d_ptr, h_ptr, size), num);

    return d_ptr;
}

// flatten per-level host arrays into one contiguous host buffer
void flatten_arr_i(int* h_ptr_flattened, std::vector<int*>& h_v, int size_total, int* size_each)
{
    int counter = 0;
    int n_v = h_v.size();

    for (int i = 0; i < n_v; i++) { // levels
        for (int j = 0; j < size_each[i]; j++) {
            h_ptr_flattened[counter] = h_v.at(i)[j];
            counter++;
        }
    }
}

void flatten_arr_cv(CUSTOMREAL* h_ptr_flattened, std::vector<CUSTOMREAL*>& h_v, int size_total, int* size_each)
{
    int counter = 0;
    int n_v = h_v.size();

    for (int i = 0; i < n_v; i++) { // levels
        for (int j = 0; j < size_each[i]; j++) {
            h_ptr_flattened[counter] = h_v.at(i)[j];
            counter++;
        }
    }
}

void flatten_arr_bl(bool* h_ptr_flattened, std::vector<bool*>& h_v, int size_total, int* size_each)
{
    int counter = 0;
    int n_v = h_v.size();

    for (int i = 0; i < n_v; i++) { // levels
        for (int j = 0; j < size_each[i]; j++) {
            h_ptr_flattened[counter] = h_v.at(i)[j];
            counter++;
        }
    }
}

// allocate, flatten and copy from host to device
void* allocate_and_copy_host_to_device_flattened_i(std::vector<int*>& vh, int size_total, int* size_each, int num){
    // flatten
    int* h_ptr_flattened = new int[size_total];
    flatten_arr_i(h_ptr_flattened, vh, size_total, size_each);

    // allocate and copy
    void* d_ptr = allocate_and_copy_host_to_device_i(h_ptr_flattened, size_total, num);

    // free the temporary host buffer
    delete[] h_ptr_flattened;

    return d_ptr;
}

void* allocate_and_copy_host_to_device_flattened_cv(std::vector<CUSTOMREAL*>& vh, int size_total, int* size_each, int num){
    // flatten
    CUSTOMREAL* h_ptr_flattened = new CUSTOMREAL[size_total];
    flatten_arr_cv(h_ptr_flattened, vh, size_total, size_each);

    // allocate and copy
    void* d_ptr = allocate_and_copy_host_to_device_cv(h_ptr_flattened, size_total, num);

    // free the temporary host buffer
    delete[] h_ptr_flattened;

    return d_ptr;
}

void* allocate_and_copy_host_to_device_flattened_bl(std::vector<bool*>& vh, int size_total, int* size_each, int num){
    // flatten
    bool* h_ptr_flattened = new bool[size_total];
    flatten_arr_bl(h_ptr_flattened, vh, size_total, size_each);

    // allocate and copy
    void* d_ptr = allocate_and_copy_host_to_device_bl(h_ptr_flattened, size_total, num);

    // free the temporary host buffer
    delete[] h_ptr_flattened;

    return d_ptr;
}
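A minimal usage sketch (not part of this commit) of the flattened copy path; the level count, sizes, and values are made up:

    // Two "levels" of host data flattened into one contiguous device array.
    CUSTOMREAL level0[3] = {1.0, 2.0, 3.0};
    CUSTOMREAL level1[2] = {4.0, 5.0};
    std::vector<CUSTOMREAL*> levels = {level0, level1};
    int size_each[2] = {3, 2};  // nodes per level (assumed)
    int size_total  = 5;        // sum of size_each

    CUSTOMREAL* d_all = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(
            levels, size_total, size_each, /*error tag*/ 1);
    // device layout: [1,2,3,4,5]; level l starts at offset sum(size_each[0..l-1])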
cuda/cuda_utils.cuh  (new file, +211 lines)
@@ -0,0 +1,211 @@
#ifndef CUDA_UTILS_H
#define CUDA_UTILS_H

#include <mpi.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <vector>

#include "cuda_constants.cuh"

// macro to convert (i,j,k) grid indices to a linear array index
// for a loc_I x loc_J x loc_K volume (II = loc_I, JJ = loc_J)
#define I2V_cuda(i,j,k,II,JJ) ((k)*(JJ)*(II)+(j)*(II)+(i))


// allocate memory on device
cudaError_t allocate_memory_on_device_i(void** d_ptr, size_t size);
cudaError_t allocate_memory_on_device_cv(void** d_ptr, size_t size);
cudaError_t allocate_memory_on_device_bl(void** d_ptr, size_t size);
cudaError_t allocate_memory_on_device_cv_pinned(void** d_ptr, size_t size);

// deallocate memory on device
cudaError_t deallocate_memory_on_device_i(int*& d_ptr);
cudaError_t deallocate_memory_on_device_cv(CUSTOMREAL*& d_ptr);
cudaError_t deallocate_memory_on_device_bl(bool*& d_ptr);

// copy memory from host to device
cudaError_t copy_host_to_device_i(int* d_ptr, int* h_ptr, const size_t size);
cudaError_t copy_host_to_device_cv(CUSTOMREAL* d_ptr, CUSTOMREAL* h_ptr, const size_t size);
cudaError_t copy_host_to_device_bl(bool* d_ptr, bool* h_ptr, const size_t size);

// copy memory from device to host
cudaError_t copy_device_to_host_i(int* h_ptr, int* d_ptr, size_t size);
cudaError_t copy_device_to_host_cv(CUSTOMREAL* h_ptr, CUSTOMREAL* d_ptr, size_t size);

// allocate and copy to device
void* allocate_and_copy_host_to_device_i(int* h_ptr, size_t size, int num);
void* allocate_and_copy_host_to_device_cv(CUSTOMREAL* h_ptr, size_t size, int num);
void* allocate_and_copy_host_to_device_bl(bool* h_ptr, size_t size, int num);


// flatten per-level arrays, then allocate and copy to device
void flatten_arr_i(int* h_ptr_flattened, std::vector<int*>& h_v, int size_total, int* size_each);
void flatten_arr_cv(CUSTOMREAL* h_ptr_flattened, std::vector<CUSTOMREAL*>& h_v, int size_total, int* size_each);
void flatten_arr_bl(bool* h_ptr_flattened, std::vector<bool*>& h_v, int size_total, int* size_each);

void* allocate_and_copy_host_to_device_flattened_i(std::vector<int*>& vh, int size_total, int* size_each, int num);
void* allocate_and_copy_host_to_device_flattened_cv(std::vector<CUSTOMREAL*>& vh, int size_total, int* size_each, int num);
void* allocate_and_copy_host_to_device_flattened_bl(std::vector<bool*>& vh, int size_total, int* size_each, int num);


// MPI send/recv wrappers
static inline void cuda_send_cr(CUSTOMREAL* buf, int count, int dest, MPI_Comm inter_sub_comm){
    MPI_Send(buf, count, MPI_CR, dest, MPI_DUMMY_TAG_CUDA, inter_sub_comm);
}

static inline void cuda_recv_cr(CUSTOMREAL* buf, int count, int source, MPI_Comm inter_sub_comm){
    MPI_Status stat;
    MPI_Recv(buf, count, MPI_CR, source, MPI_DUMMY_TAG_CUDA, inter_sub_comm, &stat);
}

static inline void cuda_synchronize_all_sub(MPI_Comm& sub_comm){
    MPI_Barrier(sub_comm);
}

inline void cuda_wait_req(MPI_Request& req){
    MPI_Status status;
    MPI_Wait(&req, &status);
}


// Split a 1D block count into a 2D (x, y) grid: num_blocks_x starts as num_blocks
// with num_blocks_y = 1; while num_blocks_x exceeds the per-dimension grid limit
// (CUDA_MAX_GRID_SIZE = 65535), num_blocks_x is halved (rounded up) and
// num_blocks_y is doubled.
static inline void get_block_xy(int num_blocks, int* num_blocks_x, int* num_blocks_y) {
    *num_blocks_x = num_blocks;
    *num_blocks_y = 1;

    while (*num_blocks_x > CUDA_MAX_GRID_SIZE) {
        *num_blocks_x = (int) ceil(*num_blocks_x * 0.5f);
        *num_blocks_y = *num_blocks_y * 2;
    }
}


static inline void get_thread_block_for_3d_loop(int nx, int ny, int nz, dim3* threads, dim3* blocks) {
    threads->x = 8; threads->y = 8; threads->z = 8; // 8*8*8 = 512 threads in total
    blocks->x = (nx + threads->x - 1)/threads->x;
    blocks->y = (ny + threads->y - 1)/threads->y;
    blocks->z = (nz + threads->z - 1)/threads->z;
}


static inline void get_thread_block_for_ibound(int nx, int ny, int nz, dim3* threads, dim3* blocks) {
    threads->x = nx; threads->y = 8; threads->z = 8; // whole i extent in one block
    blocks->x = (nx + threads->x - 1)/threads->x;
    blocks->y = (ny + threads->y - 1)/threads->y;
    blocks->z = (nz + threads->z - 1)/threads->z;
}


static inline void get_thread_block_for_jbound(int nx, int ny, int nz, dim3* threads, dim3* blocks) {
    threads->x = 8; threads->y = ny; threads->z = 8; // whole j extent in one block
    blocks->x = (nx + threads->x - 1)/threads->x;
    blocks->y = (ny + threads->y - 1)/threads->y;
    blocks->z = (nz + threads->z - 1)/threads->z;
}


static inline void get_thread_block_for_kbound(int nx, int ny, int nz, dim3* threads, dim3* blocks) {
    threads->x = 8; threads->y = 8; threads->z = nz; // whole k extent in one block
    blocks->x = (nx + threads->x - 1)/threads->x;
    blocks->y = (ny + threads->y - 1)/threads->y;
    blocks->z = (nz + threads->z - 1)/threads->z;
}


inline void cuda_isend_cr(CUSTOMREAL* buf, int count, int dest, MPI_Comm& comm, MPI_Request& request){
    //MPI_Request request = MPI_REQUEST_NULL;
    //std::cout << "sending from : " << inter_sub_rank << ", to : " << dest << ", size : " << count << std::endl;
    int DUMMY_TAG = 9999;
    MPI_Isend(buf, count, MPI_CR, dest, DUMMY_TAG, comm, &request);
}

inline void cuda_irecv_cr(CUSTOMREAL* buf, int count, int source, MPI_Comm& comm, MPI_Request& request){
    //MPI_Request request = MPI_REQUEST_NULL;
    //std::cout << "receiving by : " << inter_sub_rank << ", from : " << source << ", size : " << count << std::endl;
    int DUMMY_TAG = 9999;
    MPI_Irecv(buf, count, MPI_CR, source, DUMMY_TAG, comm, &request);
}


#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

inline void print_memory_usage(){
    size_t free_byte;
    size_t total_byte;
    cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte);
    if (cudaSuccess != cuda_status){
        printf("Error: cudaMemGetInfo failed, %s\n", cudaGetErrorString(cuda_status));
        exit(1);
    }

    double free_db  = (double)free_byte;
    double total_db = (double)total_byte;
    double used_db  = total_db - free_db;

    printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",
           used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
}

inline void print_CUDA_error_if_any(cudaError_t err, int num) {
    if (cudaSuccess != err)
    {
        printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
        fflush(stdout);

        // output an error file
        FILE* fp;
        int myrank;
        char filename[BUFSIZ];
        MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
        sprintf(filename,"error_message_%06d.txt",myrank);
        fp = fopen(filename,"a+");
        if (fp != NULL) {
            fprintf(fp,"\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
            fclose(fp);
        }

        // check memory usage
        size_t free_byte;
        size_t total_byte;
        cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte);

        if (cudaSuccess != cuda_status){
            printf("Error: cudaMemGetInfo failed, %s\n", cudaGetErrorString(cuda_status));
            fflush(stdout);
            exit(1);
        }

        // print usage
        double free_db  = (double)free_byte;
        double total_db = (double)total_byte;
        double used_db  = total_db - free_db;
        printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",
               used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);

        // stop the program
        //MPI_Abort(MPI_COMM_WORLD,1);
        MPI_Finalize();
        exit(EXIT_FAILURE);
    }
}

#endif // CUDA_UTILS_H
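A minimal device-side sketch (not part of this commit) of how a kernel launched with the (x, y) grid from get_block_xy() can recover its linear node index, and where I2V_cuda() fits; the kernel name, parameters, and the per-node (i,j,k) lookup tables are hypothetical:

    // Hypothetical sweep kernel over one level of nodes.
    __global__ void sweep_level(CUSTOMREAL* tau,
                                const int* node_i, const int* node_j, const int* node_k,
                                int n_nodes_on_level, int loc_I, int loc_J) {
        // undo the (x, y) split from get_block_xy(): linear block id, then global thread id
        unsigned int iblock = blockIdx.y * gridDim.x + blockIdx.x;
        unsigned int inode  = iblock * blockDim.x + threadIdx.x;
        if (inode >= n_nodes_on_level) return; // guard the padded tail

        int i = node_i[inode], j = node_j[inode], k = node_k[inode]; // assumed lookup tables
        tau[I2V_cuda(i, j, k, loc_I, loc_J)] = 0.0;                  // placeholder update
    }

A launch would pair this with dim3(nbx, nby) from get_block_xy() and CUDA_SWEEPING_BLOCK_SIZE threads per block.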
cuda/grid_wrapper.cu  (new file, +681 lines)
@@ -0,0 +1,681 @@
#include "grid_wrapper.cuh"

void cuda_initialize_grid_1st(std::vector< std::vector<int> >& ijk, Grid_on_device* grid_dv,
                              int const& loc_I, int const& loc_J, int const& loc_K,
                              CUSTOMREAL const& dp, CUSTOMREAL const& dt, CUSTOMREAL const& dr,
                              std::vector<std::vector<int*>>& vv_i__j__k__,
                              std::vector<std::vector<int*>>& vv_ip1j__k__,
                              std::vector<std::vector<int*>>& vv_im1j__k__,
                              std::vector<std::vector<int*>>& vv_i__jp1k__,
                              std::vector<std::vector<int*>>& vv_i__jm1k__,
                              std::vector<std::vector<int*>>& vv_i__j__kp1,
                              std::vector<std::vector<int*>>& vv_i__j__km1,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fac_a,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fac_b,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fac_c,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fac_f,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_T0v,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_T0r,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_T0t,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_T0p,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fun,
                              std::vector<std::vector<bool*>>& vv_change){

    // store grid parameters
    grid_dv->loc_I_host = loc_I;
    grid_dv->loc_J_host = loc_J;
    grid_dv->loc_K_host = loc_K;
    grid_dv->dr_host = dr;
    grid_dv->dt_host = dt;
    grid_dv->dp_host = dp;

    // count node numbers
    grid_dv->n_nodes_total_host = 0;
    grid_dv->n_levels_host = ijk.size();
    // allocate grid_dv->n_nodes_on_levels_host
    grid_dv->n_nodes_on_levels_host = new int[grid_dv->n_levels_host];

    for (int i=0; i<grid_dv->n_levels_host; i++){
        grid_dv->n_nodes_on_levels_host[i] = ijk[i].size();
        grid_dv->n_nodes_total_host += grid_dv->n_nodes_on_levels_host[i];
        // track the largest level
        if (grid_dv->n_nodes_on_levels_host[i] > grid_dv->n_nodes_max_host){
            grid_dv->n_nodes_max_host = grid_dv->n_nodes_on_levels_host[i];
        }
    }

    // allocate memory on device
    grid_dv->n_nodes_on_levels = (int*) allocate_and_copy_host_to_device_i(grid_dv->n_nodes_on_levels_host, grid_dv->n_levels_host, 0);

    // neighbor-index arrays for sweep directions 0..7; the last argument is the
    // error tag reported by print_CUDA_error_if_any
    grid_dv->vv_i__j__k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 2);
    grid_dv->vv_ip1j__k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 2);
    grid_dv->vv_im1j__k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 3);
    grid_dv->vv_i__jp1k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 4);
    grid_dv->vv_i__jm1k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 5);
    grid_dv->vv_i__j__kp1_0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 6);
    grid_dv->vv_i__j__km1_0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 7);

    grid_dv->vv_i__j__k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
    grid_dv->vv_ip1j__k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
    grid_dv->vv_im1j__k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
    grid_dv->vv_i__jp1k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
    grid_dv->vv_i__jm1k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
    grid_dv->vv_i__j__kp1_1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);
    grid_dv->vv_i__j__km1_1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 14);

    grid_dv->vv_i__j__k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 15);
    grid_dv->vv_ip1j__k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 16);
    grid_dv->vv_im1j__k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 17);
    grid_dv->vv_i__jp1k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 18);
    grid_dv->vv_i__jm1k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 19);
    grid_dv->vv_i__j__kp1_2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 20);
    grid_dv->vv_i__j__km1_2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 21);

    grid_dv->vv_i__j__k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 22);
    grid_dv->vv_ip1j__k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 23);
    grid_dv->vv_im1j__k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 24);
    grid_dv->vv_i__jp1k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 25);
    grid_dv->vv_i__jm1k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 26);
    grid_dv->vv_i__j__kp1_3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 27);
    grid_dv->vv_i__j__km1_3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 28);

    grid_dv->vv_i__j__k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 29);
    grid_dv->vv_ip1j__k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 30);
    grid_dv->vv_im1j__k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 31);
    grid_dv->vv_i__jp1k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 32);
    grid_dv->vv_i__jm1k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 33);
    grid_dv->vv_i__j__kp1_4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 34);
    grid_dv->vv_i__j__km1_4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 35);

    grid_dv->vv_i__j__k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 36);
    grid_dv->vv_ip1j__k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 37);
    grid_dv->vv_im1j__k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 38);
    grid_dv->vv_i__jp1k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 39);
    grid_dv->vv_i__jm1k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 40);
    grid_dv->vv_i__j__kp1_5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 41);
    grid_dv->vv_i__j__km1_5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 42);

    grid_dv->vv_i__j__k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 43);
    grid_dv->vv_ip1j__k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 44);
    grid_dv->vv_im1j__k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 45);
    grid_dv->vv_i__jp1k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 46);
    grid_dv->vv_i__jm1k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 47);
    grid_dv->vv_i__j__kp1_6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 48);
    grid_dv->vv_i__j__km1_6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 49);

    grid_dv->vv_i__j__k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 50);
    grid_dv->vv_ip1j__k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 51);
    grid_dv->vv_im1j__k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 52);
    grid_dv->vv_i__jp1k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 53);
    grid_dv->vv_i__jm1k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 54);
    grid_dv->vv_i__j__kp1_7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 55);
    grid_dv->vv_i__j__km1_7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 56);

    // coefficient and field arrays per sweep direction
    grid_dv->vv_fac_a_0  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_a.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 57);
    grid_dv->vv_fac_b_0  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_b.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 58);
    grid_dv->vv_fac_c_0  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_c.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 59);
    grid_dv->vv_fac_f_0  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_f.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 60);
    grid_dv->vv_T0v_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0v.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 61);
    grid_dv->vv_T0r_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0r.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 62);
    grid_dv->vv_T0t_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0t.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 63);
    grid_dv->vv_T0p_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0p.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 64);
    grid_dv->vv_fun_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fun.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 65);
    grid_dv->vv_change_0 = (bool*)       allocate_and_copy_host_to_device_flattened_bl(vv_change.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 66);

    grid_dv->vv_fac_a_1  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_a.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 67);
    grid_dv->vv_fac_b_1  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_b.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 68);
    grid_dv->vv_fac_c_1  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_c.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 69);
    grid_dv->vv_fac_f_1  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_f.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 70);
    grid_dv->vv_T0v_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0v.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 71);
    grid_dv->vv_T0r_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0r.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 72);
    grid_dv->vv_T0t_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0t.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 73);
    grid_dv->vv_T0p_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0p.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 74);
    grid_dv->vv_fun_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fun.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 75);
    grid_dv->vv_change_1 = (bool*)       allocate_and_copy_host_to_device_flattened_bl(vv_change.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 76);

    grid_dv->vv_fac_a_2  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_a.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_2  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_b.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_2  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_c.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_2  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_f.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0v.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0r.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0t.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0p.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fun.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_2 = (bool*)       allocate_and_copy_host_to_device_flattened_bl(vv_change.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_3  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_a.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_3  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_b.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_3  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_c.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_3  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_f.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0v.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0r.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0t.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0p.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fun.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_3 = (bool*)       allocate_and_copy_host_to_device_flattened_bl(vv_change.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_4  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_a.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_4  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_b.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_4  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_c.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_4  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_f.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0v.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0r.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0t.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0p.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fun.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_4 = (bool*)       allocate_and_copy_host_to_device_flattened_bl(vv_change.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_5  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_a.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_5  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_b.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_5  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_c.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_5  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_f.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0v.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0r.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0t.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0p.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fun.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_5 = (bool*)       allocate_and_copy_host_to_device_flattened_bl(vv_change.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_6  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_a.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_6  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_b.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_6  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_c.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_6  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_f.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0v.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0r.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0t.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0p.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fun.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_6 = (bool*)       allocate_and_copy_host_to_device_flattened_bl(vv_change.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_7  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_a.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_7  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_b.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_7  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_c.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_7  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fac_f.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0v.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0r.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0t.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_T0p.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv(vv_fun.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_7 = (bool*)       allocate_and_copy_host_to_device_flattened_bl(vv_change.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    // allocate tau (needs the full grid including boundary nodes)
    print_CUDA_error_if_any(allocate_memory_on_device_cv((void**)&(grid_dv->tau), loc_I*loc_J*loc_K), 87);

}

void cuda_initialize_grid_3rd(std::vector< std::vector<int> >& ijk, Grid_on_device* grid_dv,
                              int const& loc_I, int const& loc_J, int const& loc_K,
                              CUSTOMREAL const& dp, CUSTOMREAL const& dt, CUSTOMREAL const& dr,
                              std::vector<std::vector<int*>>& vv_i__j__k__,
                              std::vector<std::vector<int*>>& vv_ip1j__k__,
                              std::vector<std::vector<int*>>& vv_im1j__k__,
                              std::vector<std::vector<int*>>& vv_i__jp1k__,
                              std::vector<std::vector<int*>>& vv_i__jm1k__,
                              std::vector<std::vector<int*>>& vv_i__j__kp1,
                              std::vector<std::vector<int*>>& vv_i__j__km1,
                              std::vector<std::vector<int*>>& vv_ip2j__k__,
                              std::vector<std::vector<int*>>& vv_im2j__k__,
                              std::vector<std::vector<int*>>& vv_i__jp2k__,
                              std::vector<std::vector<int*>>& vv_i__jm2k__,
                              std::vector<std::vector<int*>>& vv_i__j__kp2,
                              std::vector<std::vector<int*>>& vv_i__j__km2,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fac_a,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fac_b,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fac_c,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fac_f,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_T0v,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_T0r,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_T0t,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_T0p,
                              std::vector<std::vector<CUSTOMREAL*>>& vv_fun,
                              std::vector<std::vector<bool*>>& vv_change){

    grid_dv->if_3rd_order = true;

    // store grid parameters
    grid_dv->loc_I_host = loc_I;
    grid_dv->loc_J_host = loc_J;
    grid_dv->loc_K_host = loc_K;
    grid_dv->dr_host = dr;
    grid_dv->dt_host = dt;
    grid_dv->dp_host = dp;

    // count node numbers
    grid_dv->n_nodes_total_host = 0;
    grid_dv->n_levels_host = ijk.size();
    grid_dv->n_nodes_on_levels_host = new int[grid_dv->n_levels_host];

    for (int i = 0; i < grid_dv->n_levels_host; i++){
        grid_dv->n_nodes_on_levels_host[i] = ijk.at(i).size();
        grid_dv->n_nodes_total_host += grid_dv->n_nodes_on_levels_host[i];
        // track the largest level
        if (grid_dv->n_nodes_on_levels_host[i] > grid_dv->n_nodes_max_host){
            grid_dv->n_nodes_max_host = grid_dv->n_nodes_on_levels_host[i];
        }
    }

    // allocate memory on device
    grid_dv->n_nodes_on_levels = (int*) allocate_and_copy_host_to_device_i(grid_dv->n_nodes_on_levels_host, grid_dv->n_levels_host, 0);

    grid_dv->vv_i__j__k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 1);
    grid_dv->vv_ip1j__k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 2);
    grid_dv->vv_im1j__k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 3);
    grid_dv->vv_i__jp1k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 4);
    grid_dv->vv_i__jm1k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 5);
    grid_dv->vv_i__j__kp1_0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 6);
    grid_dv->vv_i__j__km1_0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 7);
    grid_dv->vv_ip2j__k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip2j__k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
    grid_dv->vv_im2j__k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im2j__k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
    grid_dv->vv_i__jp2k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp2k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
    grid_dv->vv_i__jm2k___0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm2k__.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
    grid_dv->vv_i__j__kp2_0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp2.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
    grid_dv->vv_i__j__km2_0 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km2.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);

    grid_dv->vv_i__j__k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
    grid_dv->vv_ip1j__k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
    grid_dv->vv_im1j__k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
    grid_dv->vv_i__jp1k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
    grid_dv->vv_i__jm1k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
    grid_dv->vv_i__j__kp1_1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);
    grid_dv->vv_i__j__km1_1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 14);
    grid_dv->vv_ip2j__k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip2j__k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
    grid_dv->vv_im2j__k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im2j__k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
    grid_dv->vv_i__jp2k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp2k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
    grid_dv->vv_i__jm2k___1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm2k__.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
    grid_dv->vv_i__j__kp2_1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp2.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
    grid_dv->vv_i__j__km2_1 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km2.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);

    grid_dv->vv_i__j__k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 15);
    grid_dv->vv_ip1j__k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 16);
    grid_dv->vv_im1j__k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 17);
    grid_dv->vv_i__jp1k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 18);
    grid_dv->vv_i__jm1k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 19);
    grid_dv->vv_i__j__kp1_2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 20);
    grid_dv->vv_i__j__km1_2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 21);
    grid_dv->vv_ip2j__k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip2j__k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
    grid_dv->vv_im2j__k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im2j__k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
    grid_dv->vv_i__jp2k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp2k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
    grid_dv->vv_i__jm2k___2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm2k__.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
    grid_dv->vv_i__j__kp2_2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp2.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
    grid_dv->vv_i__j__km2_2 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km2.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);

    grid_dv->vv_i__j__k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 22);
    grid_dv->vv_ip1j__k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 23);
    grid_dv->vv_im1j__k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 24);
    grid_dv->vv_i__jp1k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 25);
    grid_dv->vv_i__jm1k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 26);
    grid_dv->vv_i__j__kp1_3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 27);
    grid_dv->vv_i__j__km1_3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 28);
    grid_dv->vv_ip2j__k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip2j__k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
    grid_dv->vv_im2j__k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im2j__k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
    grid_dv->vv_i__jp2k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp2k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
    grid_dv->vv_i__jm2k___3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm2k__.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
    grid_dv->vv_i__j__kp2_3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp2.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
    grid_dv->vv_i__j__km2_3 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km2.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);

    grid_dv->vv_i__j__k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 29);
    grid_dv->vv_ip1j__k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 30);
    grid_dv->vv_im1j__k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 31);
    grid_dv->vv_i__jp1k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 32);
    grid_dv->vv_i__jm1k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 33);
    grid_dv->vv_i__j__kp1_4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 34);
    grid_dv->vv_i__j__km1_4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 35);
    grid_dv->vv_ip2j__k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip2j__k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
    grid_dv->vv_im2j__k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im2j__k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
|
||||
grid_dv->vv_i__jp2k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp2k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
|
||||
grid_dv->vv_i__jm2k___4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm2k__.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
|
||||
grid_dv->vv_i__j__kp2_4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp2.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
|
||||
grid_dv->vv_i__j__km2_4 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km2.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);
|
||||
|
||||
grid_dv->vv_i__j__k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 36);
|
||||
grid_dv->vv_ip1j__k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 37);
|
||||
grid_dv->vv_im1j__k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 38);
|
||||
grid_dv->vv_i__jp1k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 39);
|
||||
grid_dv->vv_i__jm1k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 40);
|
||||
grid_dv->vv_i__j__kp1_5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 41);
|
||||
grid_dv->vv_i__j__km1_5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 42);
|
||||
grid_dv->vv_ip2j__k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip2j__k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
|
||||
grid_dv->vv_im2j__k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im2j__k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
|
||||
grid_dv->vv_i__jp2k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp2k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
|
||||
grid_dv->vv_i__jm2k___5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm2k__.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
|
||||
grid_dv->vv_i__j__kp2_5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp2.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
|
||||
grid_dv->vv_i__j__km2_5 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km2.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);
|
||||
|
||||
grid_dv->vv_i__j__k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 43);
|
||||
grid_dv->vv_ip1j__k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 44);
|
||||
grid_dv->vv_im1j__k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 45);
|
||||
grid_dv->vv_i__jp1k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 46);
|
||||
grid_dv->vv_i__jm1k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 47);
|
||||
grid_dv->vv_i__j__kp1_6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 48);
|
||||
grid_dv->vv_i__j__km1_6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 49);
|
||||
grid_dv->vv_ip2j__k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip2j__k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
|
||||
grid_dv->vv_im2j__k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im2j__k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
|
||||
grid_dv->vv_i__jp2k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp2k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
|
||||
grid_dv->vv_i__jm2k___6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm2k__.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
|
||||
grid_dv->vv_i__j__kp2_6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp2.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
|
||||
grid_dv->vv_i__j__km2_6 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km2.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);
|
||||
|
||||
grid_dv->vv_i__j__k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 50);
|
||||
grid_dv->vv_ip1j__k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip1j__k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 51);
|
||||
grid_dv->vv_im1j__k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im1j__k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 52);
|
||||
grid_dv->vv_i__jp1k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp1k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 53);
|
||||
grid_dv->vv_i__jm1k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm1k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 54);
|
||||
grid_dv->vv_i__j__kp1_7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp1.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 55);
|
||||
grid_dv->vv_i__j__km1_7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km1.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 56);
|
||||
grid_dv->vv_ip2j__k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_ip2j__k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 8);
|
||||
grid_dv->vv_im2j__k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_im2j__k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 9);
|
||||
grid_dv->vv_i__jp2k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jp2k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 10);
|
||||
grid_dv->vv_i__jm2k___7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__jm2k__.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 11);
|
||||
grid_dv->vv_i__j__kp2_7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__kp2.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 12);
|
||||
grid_dv->vv_i__j__km2_7 = (int*) allocate_and_copy_host_to_device_flattened_i(vv_i__j__km2.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 13);
|
||||
|
||||
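
    // per-level field arrays, one copy per sweep direction (suffixes _0.._7):
    // the eikonal coefficients fac_a/b/c/f, the factored background field T0v
    // and its derivative fields T0r/T0t/T0p, the right-hand-side field fun,
    // and the per-node update flags (change), as consumed by the sweeping kernels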
    grid_dv->vv_fac_a_0  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_a.at(0),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 57);
    grid_dv->vv_fac_b_0  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_b.at(0),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 58);
    grid_dv->vv_fac_c_0  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_c.at(0),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 59);
    grid_dv->vv_fac_f_0  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_f.at(0),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 60);
    grid_dv->vv_T0v_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0v.at(0),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 61);
    grid_dv->vv_T0r_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0r.at(0),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 62);
    grid_dv->vv_T0t_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0t.at(0),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 63);
    grid_dv->vv_T0p_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0p.at(0),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 64);
    grid_dv->vv_fun_0    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fun.at(0),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 65);
    grid_dv->vv_change_0 = (bool*)       allocate_and_copy_host_to_device_flattened_bl( vv_change.at(0), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 66);

    grid_dv->vv_fac_a_1  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_a.at(1),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 67);
    grid_dv->vv_fac_b_1  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_b.at(1),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 68);
    grid_dv->vv_fac_c_1  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_c.at(1),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 69);
    grid_dv->vv_fac_f_1  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_f.at(1),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 70);
    grid_dv->vv_T0v_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0v.at(1),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 71);
    grid_dv->vv_T0r_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0r.at(1),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 72);
    grid_dv->vv_T0t_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0t.at(1),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 73);
    grid_dv->vv_T0p_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0p.at(1),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 74);
    grid_dv->vv_fun_1    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fun.at(1),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 75);
    grid_dv->vv_change_1 = (bool*)       allocate_and_copy_host_to_device_flattened_bl( vv_change.at(1), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 76);

    grid_dv->vv_fac_a_2  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_a.at(2),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_2  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_b.at(2),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_2  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_c.at(2),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_2  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_f.at(2),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0v.at(2),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0r.at(2),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0t.at(2),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0p.at(2),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_2    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fun.at(2),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_2 = (bool*)       allocate_and_copy_host_to_device_flattened_bl( vv_change.at(2), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_3  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_a.at(3),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_3  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_b.at(3),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_3  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_c.at(3),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_3  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_f.at(3),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0v.at(3),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0r.at(3),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0t.at(3),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0p.at(3),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_3    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fun.at(3),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_3 = (bool*)       allocate_and_copy_host_to_device_flattened_bl( vv_change.at(3), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_4  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_a.at(4),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_4  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_b.at(4),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_4  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_c.at(4),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_4  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_f.at(4),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0v.at(4),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0r.at(4),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0t.at(4),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0p.at(4),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_4    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fun.at(4),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_4 = (bool*)       allocate_and_copy_host_to_device_flattened_bl( vv_change.at(4), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_5  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_a.at(5),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_5  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_b.at(5),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_5  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_c.at(5),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_5  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_f.at(5),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0v.at(5),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0r.at(5),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0t.at(5),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0p.at(5),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_5    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fun.at(5),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_5 = (bool*)       allocate_and_copy_host_to_device_flattened_bl( vv_change.at(5), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_6  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_a.at(6),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_6  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_b.at(6),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_6  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_c.at(6),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_6  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_f.at(6),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0v.at(6),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0r.at(6),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0t.at(6),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0p.at(6),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_6    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fun.at(6),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_6 = (bool*)       allocate_and_copy_host_to_device_flattened_bl( vv_change.at(6), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    grid_dv->vv_fac_a_7  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_a.at(7),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 77);
    grid_dv->vv_fac_b_7  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_b.at(7),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 78);
    grid_dv->vv_fac_c_7  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_c.at(7),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 79);
    grid_dv->vv_fac_f_7  = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fac_f.at(7),  grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 80);
    grid_dv->vv_T0v_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0v.at(7),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 81);
    grid_dv->vv_T0r_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0r.at(7),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 82);
    grid_dv->vv_T0t_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0t.at(7),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 83);
    grid_dv->vv_T0p_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_T0p.at(7),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 84);
    grid_dv->vv_fun_7    = (CUSTOMREAL*) allocate_and_copy_host_to_device_flattened_cv( vv_fun.at(7),    grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 85);
    grid_dv->vv_change_7 = (bool*)       allocate_and_copy_host_to_device_flattened_bl( vv_change.at(7), grid_dv->n_nodes_total_host, grid_dv->n_nodes_on_levels_host, 86);

    // allocate tau
    print_CUDA_error_if_any(allocate_memory_on_device_cv((void**)&(grid_dv->tau), loc_I*loc_J*loc_K), 87);

}

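// Note: the trailing integer argument of each allocate/copy/deallocate call in
// this file appears to serve only as a debug identifier that
// print_CUDA_error_if_any reports when the corresponding CUDA call fails
// (several identifiers are reused between levels).
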
void cuda_finalize_grid(Grid_on_device* grid_dv){

    // deallocate memory on device
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->n_nodes_on_levels), 10000);
    delete [] grid_dv->n_nodes_on_levels_host;

    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__k___0), 1);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip1j__k___0), 2);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im1j__k___0), 3);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp1k___0), 4);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm1k___0), 5);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp1_0), 6);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km1_0), 7);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__k___1), 8);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip1j__k___1), 9);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im1j__k___1), 10);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp1k___1), 11);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm1k___1), 12);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp1_1), 13);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km1_1), 14);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__k___2), 15);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip1j__k___2), 16);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im1j__k___2), 17);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp1k___2), 18);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm1k___2), 19);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp1_2), 20);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km1_2), 21);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__k___3), 22);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip1j__k___3), 23);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im1j__k___3), 24);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp1k___3), 25);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm1k___3), 26);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp1_3), 27);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km1_3), 28);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__k___4), 29);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip1j__k___4), 30);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im1j__k___4), 31);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp1k___4), 32);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm1k___4), 33);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp1_4), 34);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km1_4), 35);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__k___5), 36);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip1j__k___5), 37);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im1j__k___5), 38);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp1k___5), 39);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm1k___5), 40);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp1_5), 41);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km1_5), 42);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__k___6), 43);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip1j__k___6), 44);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im1j__k___6), 45);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp1k___6), 46);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm1k___6), 47);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp1_6), 48);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km1_6), 49);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__k___7), 50);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip1j__k___7), 51);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im1j__k___7), 52);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp1k___7), 53);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm1k___7), 54);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp1_7), 55);
    print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km1_7), 56);

    if(grid_dv->if_3rd_order){
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip2j__k___0), 10008);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im2j__k___0), 10009);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp2k___0), 10010);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm2k___0), 10011);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp2_0), 10012);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km2_0), 10013);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip2j__k___1), 10008);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im2j__k___1), 10009);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp2k___1), 10010);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm2k___1), 10011);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp2_1), 10012);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km2_1), 10013);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip2j__k___2), 10008);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im2j__k___2), 10009);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp2k___2), 10010);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm2k___2), 10011);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp2_2), 10012);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km2_2), 10013);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip2j__k___3), 10008);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im2j__k___3), 10009);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp2k___3), 10010);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm2k___3), 10011);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp2_3), 10012);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km2_3), 10013);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip2j__k___4), 10008);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im2j__k___4), 10009);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp2k___4), 10010);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm2k___4), 10011);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp2_4), 10012);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km2_4), 10013);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip2j__k___5), 10008);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im2j__k___5), 10009);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp2k___5), 10010);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm2k___5), 10011);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp2_5), 10012);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km2_5), 10013);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip2j__k___6), 10008);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im2j__k___6), 10009);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp2k___6), 10010);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm2k___6), 10011);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp2_6), 10012);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km2_6), 10013);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_ip2j__k___7), 10008);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_im2j__k___7), 10009);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jp2k___7), 10010);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__jm2k___7), 10011);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__kp2_7), 10012);
        print_CUDA_error_if_any(deallocate_memory_on_device_i(grid_dv->vv_i__j__km2_7), 10013);
    }

    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_a_0), 10057);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_b_0), 10058);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_c_0), 10059);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_f_0), 10060);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0v_0),   10061);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0r_0),   10062);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0t_0),   10063);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0p_0),   10064);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fun_0),   10065);
    print_CUDA_error_if_any(deallocate_memory_on_device_bl(grid_dv->vv_change_0), 10066);

    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_a_1), 10067);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_b_1), 10068);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_c_1), 10069);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_f_1), 10070);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0v_1),   10071);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0r_1),   10072);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0t_1),   10073);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0p_1),   10074);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fun_1),   10075);
    print_CUDA_error_if_any(deallocate_memory_on_device_bl(grid_dv->vv_change_1), 10076);

    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_a_2), 10077);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_b_2), 10078);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_c_2), 10079);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_f_2), 10080);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0v_2),   10081);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0r_2),   10082);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0t_2),   10083);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0p_2),   10084);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fun_2),   10085);
    print_CUDA_error_if_any(deallocate_memory_on_device_bl(grid_dv->vv_change_2), 10086);

    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_a_3), 10077);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_b_3), 10078);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_c_3), 10079);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_f_3), 10080);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0v_3),   10081);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0r_3),   10082);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0t_3),   10083);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0p_3),   10084);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fun_3),   10085);
    print_CUDA_error_if_any(deallocate_memory_on_device_bl(grid_dv->vv_change_3), 10086);

    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_a_4), 10077);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_b_4), 10078);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_c_4), 10079);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_f_4), 10080);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0v_4),   10081);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0r_4),   10082);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0t_4),   10083);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0p_4),   10084);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fun_4),   10085);
    print_CUDA_error_if_any(deallocate_memory_on_device_bl(grid_dv->vv_change_4), 10086);

    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_a_5), 10077);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_b_5), 10078);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_c_5), 10079);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_f_5), 10080);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0v_5),   10081);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0r_5),   10082);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0t_5),   10083);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0p_5),   10084);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fun_5),   10085);
    print_CUDA_error_if_any(deallocate_memory_on_device_bl(grid_dv->vv_change_5), 10086);

    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_a_6), 10077);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_b_6), 10078);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_c_6), 10079);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_f_6), 10080);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0v_6),   10081);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0r_6),   10082);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0t_6),   10083);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0p_6),   10084);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fun_6),   10085);
    print_CUDA_error_if_any(deallocate_memory_on_device_bl(grid_dv->vv_change_6), 10086);

    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_a_7), 10077);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_b_7), 10078);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_c_7), 10079);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fac_f_7), 10080);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0v_7),   10081);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0r_7),   10082);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0t_7),   10083);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_T0p_7),   10084);
    print_CUDA_error_if_any(deallocate_memory_on_device_cv( grid_dv->vv_fun_7),   10085);
    print_CUDA_error_if_any(deallocate_memory_on_device_bl(grid_dv->vv_change_7), 10086);

    print_CUDA_error_if_any(deallocate_memory_on_device_cv(grid_dv->tau), 10087);

}


// copy tau from host to device
void cuda_copy_tau_to_device(Grid_on_device* grid_dv, CUSTOMREAL* tau_h){
    print_CUDA_error_if_any(copy_host_to_device_cv(grid_dv->tau, tau_h, grid_dv->loc_I_host*grid_dv->loc_J_host*grid_dv->loc_K_host), 10087);
}


// copy tau from device to host
void cuda_copy_tau_to_host(Grid_on_device* grid_dv, CUSTOMREAL* tau_h){
    print_CUDA_error_if_any(copy_device_to_host_cv(tau_h, grid_dv->tau, grid_dv->loc_I_host*grid_dv->loc_J_host*grid_dv->loc_K_host), 10088);
}
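
// Example (sketch) of the host-side lifecycle around these helpers, assuming a
// Grid_on_device* grid_dv already set up by one of the initializers and a host
// buffer tau_h of size loc_I*loc_J*loc_K (hypothetical caller code, not part of
// this file):
//
//   cuda_copy_tau_to_device(grid_dv, tau_h);  // upload the current tau field
//   // ... launch the sweeping kernels over all levels and directions ...
//   cuda_copy_tau_to_host(grid_dv, tau_h);    // download the updated tau field
//   cuda_finalize_grid(grid_dv);              // release all device memory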
123
cuda/grid_wrapper.cuh
Normal file
@@ -0,0 +1,123 @@
#ifndef GRID_WRAPPER_CUH
#define GRID_WRAPPER_CUH

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cuda.h>

#include <vector>
#include <iostream>

//#include "config.h"
#include "cuda_constants.cuh"
#include "cuda_utils.cuh"

// structure for storing grid information on device
typedef struct Grid_on_device {

    // parameters
    int loc_I_host, loc_J_host, loc_K_host;
    int n_nodes_total_host;
    int n_nodes_max_host = 0;
    int n_levels_host;
    CUSTOMREAL dr_host, dt_host, dp_host;

    // index storage
    int* n_nodes_on_levels, *n_nodes_on_levels_host;

    int* vv_i__j__k___0, *vv_i__j__k___1, *vv_i__j__k___2, *vv_i__j__k___3, *vv_i__j__k___4, *vv_i__j__k___5, *vv_i__j__k___6, *vv_i__j__k___7;
    int* vv_ip1j__k___0, *vv_ip1j__k___1, *vv_ip1j__k___2, *vv_ip1j__k___3, *vv_ip1j__k___4, *vv_ip1j__k___5, *vv_ip1j__k___6, *vv_ip1j__k___7;
    int* vv_im1j__k___0, *vv_im1j__k___1, *vv_im1j__k___2, *vv_im1j__k___3, *vv_im1j__k___4, *vv_im1j__k___5, *vv_im1j__k___6, *vv_im1j__k___7;
    int* vv_i__jp1k___0, *vv_i__jp1k___1, *vv_i__jp1k___2, *vv_i__jp1k___3, *vv_i__jp1k___4, *vv_i__jp1k___5, *vv_i__jp1k___6, *vv_i__jp1k___7;
    int* vv_i__jm1k___0, *vv_i__jm1k___1, *vv_i__jm1k___2, *vv_i__jm1k___3, *vv_i__jm1k___4, *vv_i__jm1k___5, *vv_i__jm1k___6, *vv_i__jm1k___7;
    int* vv_i__j__kp1_0, *vv_i__j__kp1_1, *vv_i__j__kp1_2, *vv_i__j__kp1_3, *vv_i__j__kp1_4, *vv_i__j__kp1_5, *vv_i__j__kp1_6, *vv_i__j__kp1_7;
    int* vv_i__j__km1_0, *vv_i__j__km1_1, *vv_i__j__km1_2, *vv_i__j__km1_3, *vv_i__j__km1_4, *vv_i__j__km1_5, *vv_i__j__km1_6, *vv_i__j__km1_7;
    int* vv_ip2j__k___0, *vv_ip2j__k___1, *vv_ip2j__k___2, *vv_ip2j__k___3, *vv_ip2j__k___4, *vv_ip2j__k___5, *vv_ip2j__k___6, *vv_ip2j__k___7;
    int* vv_im2j__k___0, *vv_im2j__k___1, *vv_im2j__k___2, *vv_im2j__k___3, *vv_im2j__k___4, *vv_im2j__k___5, *vv_im2j__k___6, *vv_im2j__k___7;
    int* vv_i__jp2k___0, *vv_i__jp2k___1, *vv_i__jp2k___2, *vv_i__jp2k___3, *vv_i__jp2k___4, *vv_i__jp2k___5, *vv_i__jp2k___6, *vv_i__jp2k___7;
    int* vv_i__jm2k___0, *vv_i__jm2k___1, *vv_i__jm2k___2, *vv_i__jm2k___3, *vv_i__jm2k___4, *vv_i__jm2k___5, *vv_i__jm2k___6, *vv_i__jm2k___7;
    int* vv_i__j__kp2_0, *vv_i__j__kp2_1, *vv_i__j__kp2_2, *vv_i__j__kp2_3, *vv_i__j__kp2_4, *vv_i__j__kp2_5, *vv_i__j__kp2_6, *vv_i__j__kp2_7;
    int* vv_i__j__km2_0, *vv_i__j__km2_1, *vv_i__j__km2_2, *vv_i__j__km2_3, *vv_i__j__km2_4, *vv_i__j__km2_5, *vv_i__j__km2_6, *vv_i__j__km2_7;

    // constants
    CUSTOMREAL* vv_fac_a_0, *vv_fac_a_1, *vv_fac_a_2, *vv_fac_a_3, *vv_fac_a_4, *vv_fac_a_5, *vv_fac_a_6, *vv_fac_a_7;
    CUSTOMREAL* vv_fac_b_0, *vv_fac_b_1, *vv_fac_b_2, *vv_fac_b_3, *vv_fac_b_4, *vv_fac_b_5, *vv_fac_b_6, *vv_fac_b_7;
    CUSTOMREAL* vv_fac_c_0, *vv_fac_c_1, *vv_fac_c_2, *vv_fac_c_3, *vv_fac_c_4, *vv_fac_c_5, *vv_fac_c_6, *vv_fac_c_7;
    CUSTOMREAL* vv_fac_f_0, *vv_fac_f_1, *vv_fac_f_2, *vv_fac_f_3, *vv_fac_f_4, *vv_fac_f_5, *vv_fac_f_6, *vv_fac_f_7;
    CUSTOMREAL* vv_T0v_0, *vv_T0v_1, *vv_T0v_2, *vv_T0v_3, *vv_T0v_4, *vv_T0v_5, *vv_T0v_6, *vv_T0v_7;
    CUSTOMREAL* vv_T0r_0, *vv_T0r_1, *vv_T0r_2, *vv_T0r_3, *vv_T0r_4, *vv_T0r_5, *vv_T0r_6, *vv_T0r_7;
    CUSTOMREAL* vv_T0t_0, *vv_T0t_1, *vv_T0t_2, *vv_T0t_3, *vv_T0t_4, *vv_T0t_5, *vv_T0t_6, *vv_T0t_7;
    CUSTOMREAL* vv_T0p_0, *vv_T0p_1, *vv_T0p_2, *vv_T0p_3, *vv_T0p_4, *vv_T0p_5, *vv_T0p_6, *vv_T0p_7;
    CUSTOMREAL* vv_fun_0, *vv_fun_1, *vv_fun_2, *vv_fun_3, *vv_fun_4, *vv_fun_5, *vv_fun_6, *vv_fun_7;
    bool* vv_change_0, *vv_change_1, *vv_change_2, *vv_change_3, *vv_change_4, *vv_change_5, *vv_change_6, *vv_change_7;

    // temporary variables
    CUSTOMREAL* tau;

    bool if_3rd_order = false;

    // thread and grid dimensions for sweeping
    dim3 grid_sweep_host, threads_sweep_host;
    // array of streams
    cudaStream_t* level_streams;

} Grid_on_device;

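// The two initializers below flatten the per-level host vectors into contiguous
// device arrays; the *_0 .. *_7 members of Grid_on_device hold one copy per
// sweep ordering (presumably the eight sweep directions of the fast sweeping
// method).
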
void cuda_initialize_grid_1st(std::vector< std::vector<int> >& ijk, Grid_on_device* grid_dv, int const& loc_I, int const& loc_J, int const& loc_K,
                              CUSTOMREAL const& dp, CUSTOMREAL const& dt, CUSTOMREAL const& dr, \
                              std::vector<std::vector<int*>> & vv_i__j__k__, \
                              std::vector<std::vector<int*>> & vv_ip1j__k__, \
                              std::vector<std::vector<int*>> & vv_im1j__k__, \
                              std::vector<std::vector<int*>> & vv_i__jp1k__, \
                              std::vector<std::vector<int*>> & vv_i__jm1k__, \
                              std::vector<std::vector<int*>> & vv_i__j__kp1, \
                              std::vector<std::vector<int*>> & vv_i__j__km1, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fac_a, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fac_b, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fac_c, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fac_f, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_T0v, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_T0r, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_T0t, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_T0p, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fun, \
                              std::vector<std::vector<bool*>> & vv_change);

void cuda_initialize_grid_3rd(std::vector< std::vector<int> >& ijk, Grid_on_device* grid_dv, int const& loc_I, int const& loc_J, int const& loc_K,
                              CUSTOMREAL const& dp, CUSTOMREAL const& dt, CUSTOMREAL const& dr, \
                              std::vector<std::vector<int*>> & vv_i__j__k__, \
                              std::vector<std::vector<int*>> & vv_ip1j__k__, \
                              std::vector<std::vector<int*>> & vv_im1j__k__, \
                              std::vector<std::vector<int*>> & vv_i__jp1k__, \
                              std::vector<std::vector<int*>> & vv_i__jm1k__, \
                              std::vector<std::vector<int*>> & vv_i__j__kp1, \
                              std::vector<std::vector<int*>> & vv_i__j__km1, \
                              std::vector<std::vector<int*>> & vv_ip2j__k__, \
                              std::vector<std::vector<int*>> & vv_im2j__k__, \
                              std::vector<std::vector<int*>> & vv_i__jp2k__, \
                              std::vector<std::vector<int*>> & vv_i__jm2k__, \
                              std::vector<std::vector<int*>> & vv_i__j__kp2, \
                              std::vector<std::vector<int*>> & vv_i__j__km2, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fac_a, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fac_b, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fac_c, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fac_f, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_T0v, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_T0r, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_T0t, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_T0p, \
                              std::vector<std::vector<CUSTOMREAL*>> & vv_fun, \
                              std::vector<std::vector<bool*>> & vv_change);


// finalize
void cuda_finalize_grid(Grid_on_device* grid_dv);

// copy tau from host to device
void cuda_copy_tau_to_device(Grid_on_device* grid_dv, CUSTOMREAL* tau_h);
// copy tau from device to host
void cuda_copy_tau_to_host(Grid_on_device* grid_dv, CUSTOMREAL* tau_h);

#endif // GRID_WRAPPER_CUH
883
cuda/iterator_wrapper.cu
Normal file
@@ -0,0 +1,883 @@
#include "iterator_wrapper.cuh"
|
||||
|
||||
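// device-side numeric constants, typed as CUSTOMREAL so they follow the
// build-wide floating-point precision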
__device__ const CUSTOMREAL PLUS  = 1.0;
__device__ const CUSTOMREAL MINUS = -1.0;
__device__ const CUSTOMREAL v_eps = 1e-12;

__device__ const CUSTOMREAL _0_5_CR = 0.5;
__device__ const CUSTOMREAL _1_CR = 1.0;
__device__ const CUSTOMREAL _2_CR = 2.0;
__device__ const CUSTOMREAL _3_CR = 3.0;
__device__ const CUSTOMREAL _4_CR = 4.0;

__device__ CUSTOMREAL my_square_cu(CUSTOMREAL const& x) {
    return x*x;
}

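// first-order upwind difference: Dinv * (a - b)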
__device__ CUSTOMREAL calc_stencil_1st(CUSTOMREAL const& a, CUSTOMREAL const& b, CUSTOMREAL const& Dinv){
    return Dinv*(a-b);
}

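// third-order one-sided (WENO-type) difference: blends the low-order stencil
// (b - d) with the high-order stencil (-3a + 4b - c), both scaled by Dinv_half,
// using the nonlinear smoothness weight ww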
__device__ CUSTOMREAL calc_stencil_3rd(CUSTOMREAL const& a, CUSTOMREAL const& b, CUSTOMREAL const& c, CUSTOMREAL const& d, CUSTOMREAL const& Dinv_half, CUSTOMREAL const& sign){
    CUSTOMREAL tmp1 = v_eps + my_square_cu(a-_2_CR*b+c);
    CUSTOMREAL tmp2 = v_eps + my_square_cu(d-_2_CR*a+b);
    CUSTOMREAL ww   = _1_CR/(_1_CR+_2_CR*my_square_cu(tmp1/tmp2));
    return sign*((_1_CR-ww)*(b-d)*Dinv_half + ww*(-_3_CR*a+_4_CR*b-c)*Dinv_half);
}

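// Lax-Friedrichs Hamiltonian of the factored eikonal equation (T = T0 * tau):
// sqrt( fac_a*Tr^2 + fac_b*Tt^2 + fac_c*Tp^2 - 2*fac_f*Tt*Tp ), where each
// derivative Tx is approximated by T0x_ * tau_ + T0v_ * (px1 + px2) / 2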
__device__ CUSTOMREAL cuda_calc_LF_Hamiltonian( \
    CUSTOMREAL const& fac_a_, \
    CUSTOMREAL const& fac_b_, \
    CUSTOMREAL const& fac_c_, \
    CUSTOMREAL const& fac_f_, \
    CUSTOMREAL const& T0r_, \
    CUSTOMREAL const& T0t_, \
    CUSTOMREAL const& T0p_, \
    CUSTOMREAL const& T0v_, \
    CUSTOMREAL& tau_, \
    CUSTOMREAL const& pp1, CUSTOMREAL& pp2, \
    CUSTOMREAL const& pt1, CUSTOMREAL& pt2, \
    CUSTOMREAL const& pr1, CUSTOMREAL& pr2 \
) {
    // LF Hamiltonian for T = T0 * tau
    return sqrt(
          fac_a_ * my_square_cu(T0r_ * tau_ + T0v_ * (pr1+pr2)/_2_CR) \
        + fac_b_ * my_square_cu(T0t_ * tau_ + T0v_ * (pt1+pt2)/_2_CR) \
        + fac_c_ * my_square_cu(T0p_ * tau_ + T0v_ * (pp1+pp2)/_2_CR) \
        - _2_CR*fac_f_ * (T0t_ * tau_ + T0v_ * (pt1+pt2)/_2_CR) \
                       * (T0p_ * tau_ + T0v_ * (pp1+pp2)/_2_CR) \
    );
}

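// first-order sweep kernel: one thread per node of the current level, offset by
// i_start into the flattened per-level arrays; sigr/sigt/sigp are the
// Lax-Friedrichs artificial viscosities and coe the resulting relaxation factor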
__global__ void cuda_do_sweep_level_kernel_1st(\
    const int i__j__k__[],\
    const int ip1j__k__[],\
    const int im1j__k__[],\
    const int i__jp1k__[],\
    const int i__jm1k__[],\
    const int i__j__kp1[],\
    const int i__j__km1[],\
    const CUSTOMREAL fac_a[], \
    const CUSTOMREAL fac_b[], \
    const CUSTOMREAL fac_c[], \
    const CUSTOMREAL fac_f[], \
    const CUSTOMREAL T0v[], \
    const CUSTOMREAL T0r[], \
    const CUSTOMREAL T0t[], \
    const CUSTOMREAL T0p[], \
    const CUSTOMREAL fun[], \
    const bool changed[], \
    CUSTOMREAL tau[], \
    const int loc_I, \
    const int loc_J, \
    const int loc_K, \
    const CUSTOMREAL dr, \
    const CUSTOMREAL dt, \
    const CUSTOMREAL dp, \
    const int n_nodes_this_level, \
    const int i_start \
){

    unsigned int i_node = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    if (i_node >= n_nodes_this_level) return;

    i_node += i_start;

    //if (i_node >= loc_I*loc_J*loc_K) return;

    if (!changed[i_node]) return;

    CUSTOMREAL sigr = _1_CR*sqrt(fac_a[i_node])*T0v[i_node];
    CUSTOMREAL sigt = _1_CR*sqrt(fac_b[i_node])*T0v[i_node];
    CUSTOMREAL sigp = _1_CR*sqrt(fac_c[i_node])*T0v[i_node];
    CUSTOMREAL coe = _1_CR/((sigr/dr)+(sigt/dt)+(sigp/dp));

    CUSTOMREAL pp1 = calc_stencil_1st(tau[i__j__k__[i_node]],tau[im1j__k__[i_node]], _1_CR/dp);
    CUSTOMREAL pp2 = calc_stencil_1st(tau[ip1j__k__[i_node]],tau[i__j__k__[i_node]], _1_CR/dp);

    CUSTOMREAL pt1 = calc_stencil_1st(tau[i__j__k__[i_node]],tau[i__jm1k__[i_node]], _1_CR/dt);
    CUSTOMREAL pt2 = calc_stencil_1st(tau[i__jp1k__[i_node]],tau[i__j__k__[i_node]], _1_CR/dt);

    CUSTOMREAL pr1 = calc_stencil_1st(tau[i__j__k__[i_node]],tau[i__j__km1[i_node]], _1_CR/dr);
    CUSTOMREAL pr2 = calc_stencil_1st(tau[i__j__kp1[i_node]],tau[i__j__k__[i_node]], _1_CR/dr);

    // LF Hamiltonian
    CUSTOMREAL Htau = cuda_calc_LF_Hamiltonian(\
        fac_a[i_node], \
        fac_b[i_node], \
        fac_c[i_node], \
        fac_f[i_node], \
        T0r[i_node], \
        T0t[i_node], \
        T0p[i_node], \
        T0v[i_node], \
        tau[i__j__k__[i_node]], \
        pp1, pp2, pt1, pt2, pr1, pr2);

    tau[i__j__k__[i_node]] += coe*((fun[i_node] - Htau) \
                                   +(sigr*(pr2-pr1) \
                                   + sigt*(pt2-pt1) \
                                   + sigp*(pp2-pp1))/_2_CR);

}

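// third-order sweep kernel: recovers (i,j,k) from the flattened node index and
// falls back to the first-order stencil on nodes adjacent to the domain
// boundary (i == 1, i == loc_I-2, and likewise for j and k)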
__global__ void cuda_do_sweep_level_kernel_3rd(\
    const int i__j__k__[],\
    const int ip1j__k__[],\
    const int im1j__k__[],\
    const int i__jp1k__[],\
    const int i__jm1k__[],\
    const int i__j__kp1[],\
    const int i__j__km1[],\
    const int ip2j__k__[],\
    const int im2j__k__[],\
    const int i__jp2k__[],\
    const int i__jm2k__[],\
    const int i__j__kp2[],\
    const int i__j__km2[],\
    const CUSTOMREAL fac_a[], \
    const CUSTOMREAL fac_b[], \
    const CUSTOMREAL fac_c[], \
    const CUSTOMREAL fac_f[], \
    const CUSTOMREAL T0v[], \
    const CUSTOMREAL T0r[], \
    const CUSTOMREAL T0t[], \
    const CUSTOMREAL T0p[], \
    const CUSTOMREAL fun[], \
    const bool changed[], \
    CUSTOMREAL tau[], \
    const int loc_I, \
    const int loc_J, \
    const int loc_K, \
    const CUSTOMREAL dr, \
    const CUSTOMREAL dt, \
    const CUSTOMREAL dp, \
    const int n_nodes_this_level, \
    const int i_start \
){

    CUSTOMREAL pp1, pp2, pt1, pt2, pr1, pr2;

    unsigned int i_node = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    if (i_node >= n_nodes_this_level) return;

    i_node += i_start;
    //if (i_node >= loc_I*loc_J*loc_K) return;

    if (!changed[i_node]) return;

    int k = i__j__k__[i_node] / (loc_I*loc_J);
    int j = (i__j__k__[i_node] - k*loc_I*loc_J)/loc_I;
    int i = i__j__k__[i_node] - k*loc_I*loc_J - j*loc_I;

    CUSTOMREAL DRinv = _1_CR/dr;
    CUSTOMREAL DTinv = _1_CR/dt;
    CUSTOMREAL DPinv = _1_CR/dp;
    CUSTOMREAL DRinv_half = DRinv*_0_5_CR;
    CUSTOMREAL DTinv_half = DTinv*_0_5_CR;
    CUSTOMREAL DPinv_half = DPinv*_0_5_CR;

    CUSTOMREAL sigr = _1_CR*sqrt(fac_a[i_node])*T0v[i_node];
    CUSTOMREAL sigt = _1_CR*sqrt(fac_b[i_node])*T0v[i_node];
    CUSTOMREAL sigp = _1_CR*sqrt(fac_c[i_node])*T0v[i_node];
    CUSTOMREAL coe = _1_CR/((sigr/dr)+(sigt/dt)+(sigp/dp));

    // direction p
    if (i == 1) {
        pp1 = calc_stencil_1st(tau[i__j__k__[i_node]],tau[im1j__k__[i_node]],DPinv);
        pp2 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[ip1j__k__[i_node]],tau[ip2j__k__[i_node]],tau[im1j__k__[i_node]],DPinv_half, PLUS);
    } else if (i == loc_I-2) {
        pp1 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[im1j__k__[i_node]],tau[im2j__k__[i_node]],tau[ip1j__k__[i_node]],DPinv_half, MINUS);
        pp2 = calc_stencil_1st(tau[ip1j__k__[i_node]],tau[i__j__k__[i_node]],DPinv);
    } else {
        pp1 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[im1j__k__[i_node]],tau[im2j__k__[i_node]],tau[ip1j__k__[i_node]],DPinv_half, MINUS);
        pp2 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[ip1j__k__[i_node]],tau[ip2j__k__[i_node]],tau[im1j__k__[i_node]],DPinv_half, PLUS);
    }

    // direction t
    if (j == 1) {
        pt1 = calc_stencil_1st(tau[i__j__k__[i_node]],tau[i__jm1k__[i_node]],DTinv);
        pt2 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[i__jp1k__[i_node]],tau[i__jp2k__[i_node]],tau[i__jm1k__[i_node]],DTinv_half, PLUS);
    } else if (j == loc_J-2) {
        pt1 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[i__jm1k__[i_node]],tau[i__jm2k__[i_node]],tau[i__jp1k__[i_node]],DTinv_half, MINUS);
        pt2 = calc_stencil_1st(tau[i__jp1k__[i_node]],tau[i__j__k__[i_node]],DTinv);
    } else {
        pt1 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[i__jm1k__[i_node]],tau[i__jm2k__[i_node]],tau[i__jp1k__[i_node]],DTinv_half, MINUS);
        pt2 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[i__jp1k__[i_node]],tau[i__jp2k__[i_node]],tau[i__jm1k__[i_node]],DTinv_half, PLUS);
    }

    // direction r
    if (k == 1) {
        pr1 = calc_stencil_1st(tau[i__j__k__[i_node]],tau[i__j__km1[i_node]],DRinv);
        pr2 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[i__j__kp1[i_node]],tau[i__j__kp2[i_node]],tau[i__j__km1[i_node]],DRinv_half, PLUS);
    } else if (k == loc_K-2) {
        pr1 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[i__j__km1[i_node]],tau[i__j__km2[i_node]],tau[i__j__kp1[i_node]],DRinv_half, MINUS);
        pr2 = calc_stencil_1st(tau[i__j__kp1[i_node]],tau[i__j__k__[i_node]],DRinv);
    } else {
        pr1 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[i__j__km1[i_node]],tau[i__j__km2[i_node]],tau[i__j__kp1[i_node]],DRinv_half, MINUS);
        pr2 = calc_stencil_3rd(tau[i__j__k__[i_node]],tau[i__j__kp1[i_node]],tau[i__j__kp2[i_node]],tau[i__j__km1[i_node]],DRinv_half, PLUS);
    }

    CUSTOMREAL Htau = cuda_calc_LF_Hamiltonian(\
        fac_a[i_node], \
        fac_b[i_node], \
        fac_c[i_node], \
        fac_f[i_node], \
        T0r[i_node], \
        T0t[i_node], \
        T0p[i_node], \
        T0v[i_node], \
        tau[i__j__k__[i_node]], \
        pp1, pp2, pt1, pt2, pr1, pr2);

    tau[i__j__k__[i_node]] += coe*((fun[i_node] - Htau) \
                                   +(sigr*(pr2-pr1) \
                                   + sigt*(pt2-pt1) \
                                   + sigp*(pp2-pp1))/_2_CR);

}

|
||||
|
||||
|
||||
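// Note on the stencil switching above: a node one cell away from a domain
// face cannot use the full third-order stencil on the boundary side (the
// +/-2 neighbor would fall outside the subdomain), so the kernel falls back
// to the first-order one-sided difference there and keeps the third-order
// stencil on the interior side. Assuming calc_stencil_1st computes the plain
// one-sided difference, the two fallbacks in the p direction amount to:
//
//   pp1 = (tau[i,j,k]   - tau[i-1,j,k]) / dp;   // used when i == 1
//   pp2 = (tau[i+1,j,k] - tau[i,j,k]  ) / dp;   // used when i == loc_I-2
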
void initialize_sweep_params(Grid_on_device* grid_dv){

    // check numBlocksPerSm and set the block size accordingly
    //int numBlocksPerSm = 0;
    //int block_size = CUDA_SWEEPING_BLOCK_SIZE;

    //int device;
    //cudaGetDevice(&device);

    //cudaDeviceProp deviceProp;
    //cudaGetDeviceProperties(&deviceProp, device);
    //if(grid_dv->if_3rd_order)
    //    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, cuda_do_sweep_level_kernel_3rd, CUDA_SWEEPING_BLOCK_SIZE, 0);
    //else
    //    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, cuda_do_sweep_level_kernel_1st, CUDA_SWEEPING_BLOCK_SIZE, 0);

    //int max_cooperative_blocks = deviceProp.multiProcessorCount*numBlocksPerSm;

    //grid_dv->threads_sweep_host = dim3(block_size, 1, 1);
    //grid_dv->grid_sweep_host = dim3(max_cooperative_blocks, 1, 1);

    // allocate one stream handle per level; currently every level uses the
    // default stream (nullptr), so launches are issued in order
    //grid_dv->level_streams = (cudaStream_t*)malloc(CUDA_MAX_NUM_STREAMS*sizeof(cudaStream_t));
    //for (int i = 0; i < CUDA_MAX_NUM_STREAMS; i++) {
    grid_dv->level_streams = (cudaStream_t*)malloc(grid_dv->n_levels_host*sizeof(cudaStream_t));
    for (int i = 0; i < grid_dv->n_levels_host; i++) {
        //cudaStreamCreate(&(grid_dv->level_streams[i]));
        //cudaStreamCreateWithFlags(&(grid_dv->level_streams[i]), cudaStreamNonBlocking);
        grid_dv->level_streams[i] = nullptr;
    }
}

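// If per-level streams are ever re-enabled, a minimal sketch would create one
// non-blocking stream per level instead of the default-stream nullptr above
// (standard CUDA runtime API; finalize_sweep_params would then have to call
// cudaStreamDestroy on each handle before freeing the array):
//
//   for (int i = 0; i < grid_dv->n_levels_host; i++)
//       cudaStreamCreateWithFlags(&(grid_dv->level_streams[i]), cudaStreamNonBlocking);
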
void finalize_sweep_params(Grid_on_device* grid_on_dv){
    // destroy streams (no-op while the default stream is used)
    //for (int i = 0; i < CUDA_MAX_NUM_STREAMS; i++) {
    //for (int i = 0; i < grid_on_dv->n_levels_host; i++) {
    //    cudaStreamDestroy(grid_on_dv->level_streams[i]);
    //}

    free(grid_on_dv->level_streams);
}

void run_kernel(Grid_on_device* grid_dv, int const& iswp, int& i_node_offset, int const& i_level,
                dim3& grid_each, dim3& threads_each, int& n_nodes_this_level){

    int id_stream = i_level;// % CUDA_MAX_NUM_STREAMS;

    // Each sweep direction uses its own set of device arrays (suffix _0 .. _7).
    // The argument layout is identical for all eight directions and must follow
    // the kernels' parameter order (i, i+1, i-1, j+1, j-1, k+1, k-1, then the
    // +/-2 variants for the 3rd-order kernel), so the lists are generated with
    // helper macros instead of being written out eight times per kernel.

#define SWEEP_ARGS_COMMON(SUF) \
        &(grid_dv->vv_fac_a_##SUF), \
        &(grid_dv->vv_fac_b_##SUF), \
        &(grid_dv->vv_fac_c_##SUF), \
        &(grid_dv->vv_fac_f_##SUF), \
        &(grid_dv->vv_T0v_##SUF), \
        &(grid_dv->vv_T0r_##SUF), \
        &(grid_dv->vv_T0t_##SUF), \
        &(grid_dv->vv_T0p_##SUF), \
        &(grid_dv->vv_fun_##SUF), \
        &(grid_dv->vv_change_##SUF), \
        &(grid_dv->tau), \
        &(grid_dv->loc_I_host), \
        &(grid_dv->loc_J_host), \
        &(grid_dv->loc_K_host), \
        &(grid_dv->dr_host), \
        &(grid_dv->dt_host), \
        &(grid_dv->dp_host), \
        &n_nodes_this_level, \
        &i_node_offset

#define LAUNCH_SWEEP_3RD(SUF, ERR_CODE) { \
        void* kernelArgs[]{ \
            &(grid_dv->vv_i__j__k___##SUF), \
            &(grid_dv->vv_ip1j__k___##SUF), \
            &(grid_dv->vv_im1j__k___##SUF), \
            &(grid_dv->vv_i__jp1k___##SUF), \
            &(grid_dv->vv_i__jm1k___##SUF), \
            &(grid_dv->vv_i__j__kp1_##SUF), \
            &(grid_dv->vv_i__j__km1_##SUF), \
            &(grid_dv->vv_ip2j__k___##SUF), \
            &(grid_dv->vv_im2j__k___##SUF), \
            &(grid_dv->vv_i__jp2k___##SUF), \
            &(grid_dv->vv_i__jm2k___##SUF), \
            &(grid_dv->vv_i__j__kp2_##SUF), \
            &(grid_dv->vv_i__j__km2_##SUF), \
            SWEEP_ARGS_COMMON(SUF) \
        }; \
        print_CUDA_error_if_any(cudaLaunchKernel((void*) cuda_do_sweep_level_kernel_3rd, \
                                grid_each, threads_each, kernelArgs, 0, \
                                grid_dv->level_streams[id_stream]), ERR_CODE); \
    }

#define LAUNCH_SWEEP_1ST(SUF, ERR_CODE) { \
        void* kernelArgs[]{ \
            &(grid_dv->vv_i__j__k___##SUF), \
            &(grid_dv->vv_ip1j__k___##SUF), \
            &(grid_dv->vv_im1j__k___##SUF), \
            &(grid_dv->vv_i__jp1k___##SUF), \
            &(grid_dv->vv_i__jm1k___##SUF), \
            &(grid_dv->vv_i__j__kp1_##SUF), \
            &(grid_dv->vv_i__j__km1_##SUF), \
            SWEEP_ARGS_COMMON(SUF) \
        }; \
        print_CUDA_error_if_any(cudaLaunchKernel((void*) cuda_do_sweep_level_kernel_1st, \
                                grid_each, threads_each, kernelArgs, 0, \
                                grid_dv->level_streams[id_stream]), ERR_CODE); \
    }

    if (grid_dv->if_3rd_order) {
        switch (iswp) {
            case 0:  LAUNCH_SWEEP_3RD(0, 30001); break;
            case 1:  LAUNCH_SWEEP_3RD(1, 30001); break;
            case 2:  LAUNCH_SWEEP_3RD(2, 30001); break;
            case 3:  LAUNCH_SWEEP_3RD(3, 30001); break;
            case 4:  LAUNCH_SWEEP_3RD(4, 30001); break;
            case 5:  LAUNCH_SWEEP_3RD(5, 30001); break;
            case 6:  LAUNCH_SWEEP_3RD(6, 30001); break;
            default: LAUNCH_SWEEP_3RD(7, 30001); break;
        }
    } else { // 1st order
        switch (iswp) {
            case 0:  LAUNCH_SWEEP_1ST(0, 30000); break;
            case 1:  LAUNCH_SWEEP_1ST(1, 30001); break;
            case 2:  LAUNCH_SWEEP_1ST(2, 30002); break;
            case 3:  LAUNCH_SWEEP_1ST(3, 30003); break;
            case 4:  LAUNCH_SWEEP_1ST(4, 30004); break;
            case 5:  LAUNCH_SWEEP_1ST(5, 30005); break;
            case 6:  LAUNCH_SWEEP_1ST(6, 30006); break;
            default: LAUNCH_SWEEP_1ST(7, 30007); break;
        }
    }

#undef LAUNCH_SWEEP_1ST
#undef LAUNCH_SWEEP_3RD
#undef SWEEP_ARGS_COMMON

    // synchronize all streams
    //print_CUDA_error_if_any(cudaStreamSynchronize(grid_dv->level_streams[id_stream]), 30008);
}

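// cudaLaunchKernel expects kernelArgs as an array of pointers, one per kernel
// parameter, in declaration order; the runtime reads each argument through its
// pointer at launch time. A minimal self-contained example of the same launch
// convention (illustrative only, not part of the solver):
//
//   __global__ void axpy(int n, const CUSTOMREAL* x, CUSTOMREAL* y, CUSTOMREAL a){
//       int i = blockIdx.x*blockDim.x + threadIdx.x;
//       if (i < n) y[i] += a*x[i];
//   }
//
//   void launch_axpy(int n, const CUSTOMREAL* x, CUSTOMREAL* y, CUSTOMREAL a){
//       void* args[]{ &n, &x, &y, &a };
//       cudaLaunchKernel((void*) axpy, dim3((n+255)/256), dim3(256), args, 0, nullptr);
//   }
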
// this function calculates all levels of one single sweep direction
void cuda_run_iteration_forward(Grid_on_device* grid_dv, int const& iswp){

    initialize_sweep_params(grid_dv);

    int block_size = CUDA_SWEEPING_BLOCK_SIZE;
    int num_blocks_x, num_blocks_y;
    int i_node_offset = 0;

    //get_block_xy(ceil(grid_dv->n_nodes_max_host/block_size+0.5), &num_blocks_x, &num_blocks_y);
    //dim3 grid_each(num_blocks_x, num_blocks_y);
    //dim3 threads_each(block_size, 1, 1);

    for (int i_level = 0; i_level < grid_dv->n_levels_host; i_level++){
        // ceiling division: enough blocks to cover every node on this level
        int n_blocks = (grid_dv->n_nodes_on_levels_host[i_level] + block_size - 1) / block_size;
        get_block_xy(n_blocks, &num_blocks_x, &num_blocks_y);
        dim3 grid_each(num_blocks_x, num_blocks_y);
        dim3 threads_each(block_size, 1, 1);

        run_kernel(grid_dv, iswp, i_node_offset, i_level, grid_each, threads_each, grid_dv->n_nodes_on_levels_host[i_level]);
        //run_kernel(grid_dv, iswp, i_node_offset, i_level, grid_dv->grid_sweep_host, grid_dv->threads_sweep_host, grid_dv->n_nodes_on_levels_host[i_level]);

        i_node_offset += grid_dv->n_nodes_on_levels_host[i_level];
    }

    finalize_sweep_params(grid_dv);

    // check memory leak
    //print_memory_usage();
}

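// The 2D launch grid exists because gridDim.x is capped (CUDA_MAX_GRID_SIZE =
// 65535 on this code path), so large levels are folded into (x, y) blocks and
// the kernels linearize the index as
// (blockIdx.y*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x.
// A sketch of what get_block_xy is assumed to do under that convention:
//
//   void get_block_xy(int n_blocks, int* nx, int* ny){
//       *nx = n_blocks; *ny = 1;
//       if (n_blocks > CUDA_MAX_GRID_SIZE) {
//           *ny = (n_blocks + CUDA_MAX_GRID_SIZE - 1) / CUDA_MAX_GRID_SIZE;
//           *nx = (n_blocks + *ny - 1) / *ny;
//       }
//   }
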
27
cuda/iterator_wrapper.cuh
Normal file
@@ -0,0 +1,27 @@
#ifndef ITERATOR_WRAPPER_CUH
#define ITERATOR_WRAPPER_CUH

#include <memory>
#include <iostream>
#include <algorithm>

#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

#include <stdio.h>
#include "grid_wrapper.cuh"
#include "cuda_utils.cuh"


//void cuda_do_sweep_level_kernel_3rd();
//void cuda_do_sweep_level_kernel_1st();

// note: i_node_offset and n_nodes_this_level are passed by non-const
// reference to match the definition of run_kernel
void run_kernel(Grid_on_device*, int const&, int&, int const&, dim3&, dim3&, int&);

void initialize_sweep_params(Grid_on_device*);
void finalize_sweep_params(Grid_on_device*);
void cuda_run_iteration_forward(Grid_on_device*, int const&);


#endif // ITERATOR_WRAPPER_CUH
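/* Typical usage of this wrapper (a sketch; one forward iteration sweeps all
   eight directions, matching the iswp == 0..7 branches in run_kernel):

   for (int iswp = 0; iswp < 8; iswp++)
       cuda_run_iteration_forward(grid_dv, iswp);
*/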