// cuda/cuda_initialize.cuh

#ifndef CUDA_INITIALIZE_H
#define CUDA_INITIALIZE_H


#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

//#include "config.h"
#include "cuda_constants.cuh"

void get_free_memory(double* free_db, double* used_db, double* total_db) {

    // gets memory usage in byte
    size_t free_byte ;
    size_t total_byte ;
    cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
    if ( cudaSuccess != cuda_status ){
        printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
        exit(EXIT_FAILURE);
    }

    *free_db = (double)free_byte ;
    *total_db = (double)total_byte ;
    *used_db = *total_db - *free_db ;
    return;
}


// setup cuda constants and variables by reading device properties
void initialize_cuda(){

    std::cout << "Initializing CUDA..." << std::endl;

    int ncuda_device;
    int device;

    // count number of devices
    cudaGetDeviceCount(&ncuda_device);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("cudaGetDeviceCount returned error code %d after %d devices\n", err, ncuda_device);
        exit(1);
    }

    if (ncuda_device == 0)
    {
        printf("There is no device supporting CUDA\n");
        exit(1);
    }

    // set the active device
    if (ncuda_device >= 1){
        cudaDeviceReset();

        device = world_rank % ncuda_device;
        cudaSetDevice(device);

        cudaFree(0);

        // check device is set
        cudaGetDevice(&device);
        if (device != world_rank % ncuda_device){
            printf("Error: Could not set device to %d\n", world_rank % ncuda_device);
            exit(1);
        }
    } // end if ncuda_device >= 1

    cudaGetDevice(&device);

    // get device properties
    cudaDeviceProp deviceProp; // in cuda_constants
    cudaGetDeviceProperties(&deviceProp, device);

    // exit if machine has no cuda enable device
    if (deviceProp.major == 9999 && deviceProp.minor == 9999){
        printf("Error: No CUDA device found\n");
        exit(1);
    }

    // print device properties
    char filename[256];

    if (world_rank == 0){
        sprintf(filename, "cuda_device_info.txt");
        FILE *fp = fopen(filename, "w");

        if(fp == NULL){
            printf("Error: Could not open file %s\n", filename);
            exit(1);
        }

        // display device properties
        fprintf(fp,"Device Name = %s\n",deviceProp.name);
        fprintf(fp,"memory:\n");
        fprintf(fp,"  totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
        fprintf(fp,"  totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
        fprintf(fp,"  totalConstMem (in bytes): %lu\n",(unsigned long) deviceProp.totalConstMem);
        fprintf(fp,"  Maximum 1D texture size (in bytes): %lu\n",(unsigned long) deviceProp.maxTexture1D);
        fprintf(fp,"  sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
        fprintf(fp,"  regsPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.regsPerBlock);
        fprintf(fp,"blocks:\n");
        fprintf(fp,"  Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
        fprintf(fp,"  Maximum size of each dimension of a block: %d x %d x %d\n",
                deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
        fprintf(fp,"  Maximum sizes of each dimension of a grid: %d x %d x %d\n",
                deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
        fprintf(fp,"features:\n");
        fprintf(fp,"  Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
        fprintf(fp,"  multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
        if (deviceProp.canMapHostMemory){
          fprintf(fp,"  canMapHostMemory: TRUE\n");
        }else{
          fprintf(fp,"  canMapHostMemory: FALSE\n");
        }
        if (deviceProp.deviceOverlap){
          fprintf(fp,"  deviceOverlap: TRUE\n");
        }else{
          fprintf(fp,"  deviceOverlap: FALSE\n");
        }
        if (deviceProp.concurrentKernels){
          fprintf(fp,"  concurrentKernels: TRUE\n");
        }else{
          fprintf(fp,"  concurrentKernels: FALSE\n");
        }
        // outputs initial memory infos via cudaMemGetInfo()
        double free_db,used_db,total_db;
        get_free_memory(&free_db,&used_db,&total_db);
        fprintf(fp,"memory usage:\n");
        fprintf(fp,"  rank %d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank,
                used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);

        // closes output file
        fclose(fp);
    }

}


void finalize_cuda(){
    cudaDeviceReset();
}

#endif // CUDA_INITIALIZE_H
initial upload 2025-12-17 10:53:43 +08:00			`#ifndef CUDA_INITIALIZE_H`
			`#define CUDA_INITIALIZE_H`


			`#include <cuda_runtime.h>`
			`#include <cuda.h>`
			`#include <cuda_runtime_api.h>`

			`//#include "config.h"`
			`#include "cuda_constants.cuh"`

			`void get_free_memory(double* free_db, double* used_db, double* total_db) {`

			`// gets memory usage in byte`
			`size_t free_byte ;`
			`size_t total_byte ;`
			`cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;`
			`if ( cudaSuccess != cuda_status ){`
			`printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );`
			`exit(EXIT_FAILURE);`
			`}`

			`*free_db = (double)free_byte ;`
			`*total_db = (double)total_byte ;`
			`used_db = total_db - *free_db ;`
			`return;`
			`}`


			`// setup cuda constants and variables by reading device properties`
			`void initialize_cuda(){`

			`std::cout << "Initializing CUDA..." << std::endl;`

			`int ncuda_device;`
			`int device;`

			`// count number of devices`
			`cudaGetDeviceCount(&ncuda_device);`

			`cudaError_t err = cudaGetLastError();`
			`if (err != cudaSuccess) {`
			`printf("cudaGetDeviceCount returned error code %d after %d devices\n", err, ncuda_device);`
			`exit(1);`
			`}`

			`if (ncuda_device == 0)`
			`{`
			`printf("There is no device supporting CUDA\n");`
			`exit(1);`
			`}`

			`// set the active device`
			`if (ncuda_device >= 1){`
			`cudaDeviceReset();`

			`device = world_rank % ncuda_device;`
			`cudaSetDevice(device);`

			`cudaFree(0);`

			`// check device is set`
			`cudaGetDevice(&device);`
			`if (device != world_rank % ncuda_device){`
			`printf("Error: Could not set device to %d\n", world_rank % ncuda_device);`
			`exit(1);`
			`}`
			`} // end if ncuda_device >= 1`

			`cudaGetDevice(&device);`

			`// get device properties`
			`cudaDeviceProp deviceProp; // in cuda_constants`
			`cudaGetDeviceProperties(&deviceProp, device);`

			`// exit if machine has no cuda enable device`
			`if (deviceProp.major == 9999 && deviceProp.minor == 9999){`
			`printf("Error: No CUDA device found\n");`
			`exit(1);`
			`}`

			`// print device properties`
			`char filename[256];`

			`if (world_rank == 0){`
			`sprintf(filename, "cuda_device_info.txt");`
			`FILE *fp = fopen(filename, "w");`

			`if(fp == NULL){`
			`printf("Error: Could not open file %s\n", filename);`
			`exit(1);`
			`}`

			`// display device properties`
			`fprintf(fp,"Device Name = %s\n",deviceProp.name);`
			`fprintf(fp,"memory:\n");`
			`fprintf(fp," totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));`
			`fprintf(fp," totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));`
			`fprintf(fp," totalConstMem (in bytes): %lu\n",(unsigned long) deviceProp.totalConstMem);`
			`fprintf(fp," Maximum 1D texture size (in bytes): %lu\n",(unsigned long) deviceProp.maxTexture1D);`
			`fprintf(fp," sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);`
			`fprintf(fp," regsPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.regsPerBlock);`
			`fprintf(fp,"blocks:\n");`
			`fprintf(fp," Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);`
			`fprintf(fp," Maximum size of each dimension of a block: %d x %d x %d\n",`
			`deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);`
			`fprintf(fp," Maximum sizes of each dimension of a grid: %d x %d x %d\n",`
			`deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);`
			`fprintf(fp,"features:\n");`
			`fprintf(fp," Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);`
			`fprintf(fp," multiProcessorCount: %d\n",deviceProp.multiProcessorCount);`
			`if (deviceProp.canMapHostMemory){`
			`fprintf(fp," canMapHostMemory: TRUE\n");`
			`}else{`
			`fprintf(fp," canMapHostMemory: FALSE\n");`
			`}`
			`if (deviceProp.deviceOverlap){`
			`fprintf(fp," deviceOverlap: TRUE\n");`
			`}else{`
			`fprintf(fp," deviceOverlap: FALSE\n");`
			`}`
			`if (deviceProp.concurrentKernels){`
			`fprintf(fp," concurrentKernels: TRUE\n");`
			`}else{`
			`fprintf(fp," concurrentKernels: FALSE\n");`
			`}`
			`// outputs initial memory infos via cudaMemGetInfo()`
			`double free_db,used_db,total_db;`
			`get_free_memory(&free_db,&used_db,&total_db);`
			`fprintf(fp,"memory usage:\n");`
			`fprintf(fp," rank %d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank,`
			`used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);`

			`// closes output file`
			`fclose(fp);`
			`}`

			`}`


			`void finalize_cuda(){`
			`cudaDeviceReset();`
			`}`

			`#endif // CUDA_INITIALIZE_H`