145 lines
4.8 KiB
Plaintext
145 lines
4.8 KiB
Plaintext
|
|
#ifndef CUDA_INITIALIZE_H
|
||
|
|
#define CUDA_INITIALIZE_H
|
||
|
|
|
||
|
|
|
||
|
|
#include <cuda_runtime.h>
|
||
|
|
#include <cuda.h>
|
||
|
|
#include <cuda_runtime_api.h>
|
||
|
|
|
||
|
|
//#include "config.h"
|
||
|
|
#include "cuda_constants.cuh"
|
||
|
|
|
||
|
|
void get_free_memory(double* free_db, double* used_db, double* total_db) {
|
||
|
|
|
||
|
|
// gets memory usage in byte
|
||
|
|
size_t free_byte ;
|
||
|
|
size_t total_byte ;
|
||
|
|
cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
|
||
|
|
if ( cudaSuccess != cuda_status ){
|
||
|
|
printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
|
||
|
|
exit(EXIT_FAILURE);
|
||
|
|
}
|
||
|
|
|
||
|
|
*free_db = (double)free_byte ;
|
||
|
|
*total_db = (double)total_byte ;
|
||
|
|
*used_db = *total_db - *free_db ;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
// setup cuda constants and variables by reading device properties
|
||
|
|
void initialize_cuda(){
|
||
|
|
|
||
|
|
std::cout << "Initializing CUDA..." << std::endl;
|
||
|
|
|
||
|
|
int ncuda_device;
|
||
|
|
int device;
|
||
|
|
|
||
|
|
// count number of devices
|
||
|
|
cudaGetDeviceCount(&ncuda_device);
|
||
|
|
|
||
|
|
cudaError_t err = cudaGetLastError();
|
||
|
|
if (err != cudaSuccess) {
|
||
|
|
printf("cudaGetDeviceCount returned error code %d after %d devices\n", err, ncuda_device);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (ncuda_device == 0)
|
||
|
|
{
|
||
|
|
printf("There is no device supporting CUDA\n");
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
// set the active device
|
||
|
|
if (ncuda_device >= 1){
|
||
|
|
cudaDeviceReset();
|
||
|
|
|
||
|
|
device = world_rank % ncuda_device;
|
||
|
|
cudaSetDevice(device);
|
||
|
|
|
||
|
|
cudaFree(0);
|
||
|
|
|
||
|
|
// check device is set
|
||
|
|
cudaGetDevice(&device);
|
||
|
|
if (device != world_rank % ncuda_device){
|
||
|
|
printf("Error: Could not set device to %d\n", world_rank % ncuda_device);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
} // end if ncuda_device >= 1
|
||
|
|
|
||
|
|
cudaGetDevice(&device);
|
||
|
|
|
||
|
|
// get device properties
|
||
|
|
cudaDeviceProp deviceProp; // in cuda_constants
|
||
|
|
cudaGetDeviceProperties(&deviceProp, device);
|
||
|
|
|
||
|
|
// exit if machine has no cuda enable device
|
||
|
|
if (deviceProp.major == 9999 && deviceProp.minor == 9999){
|
||
|
|
printf("Error: No CUDA device found\n");
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
// print device properties
|
||
|
|
char filename[256];
|
||
|
|
|
||
|
|
if (world_rank == 0){
|
||
|
|
sprintf(filename, "cuda_device_info.txt");
|
||
|
|
FILE *fp = fopen(filename, "w");
|
||
|
|
|
||
|
|
if(fp == NULL){
|
||
|
|
printf("Error: Could not open file %s\n", filename);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
// display device properties
|
||
|
|
fprintf(fp,"Device Name = %s\n",deviceProp.name);
|
||
|
|
fprintf(fp,"memory:\n");
|
||
|
|
fprintf(fp," totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
|
||
|
|
fprintf(fp," totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
|
||
|
|
fprintf(fp," totalConstMem (in bytes): %lu\n",(unsigned long) deviceProp.totalConstMem);
|
||
|
|
fprintf(fp," Maximum 1D texture size (in bytes): %lu\n",(unsigned long) deviceProp.maxTexture1D);
|
||
|
|
fprintf(fp," sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
|
||
|
|
fprintf(fp," regsPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.regsPerBlock);
|
||
|
|
fprintf(fp,"blocks:\n");
|
||
|
|
fprintf(fp," Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
|
||
|
|
fprintf(fp," Maximum size of each dimension of a block: %d x %d x %d\n",
|
||
|
|
deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
|
||
|
|
fprintf(fp," Maximum sizes of each dimension of a grid: %d x %d x %d\n",
|
||
|
|
deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
|
||
|
|
fprintf(fp,"features:\n");
|
||
|
|
fprintf(fp," Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
|
||
|
|
fprintf(fp," multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
|
||
|
|
if (deviceProp.canMapHostMemory){
|
||
|
|
fprintf(fp," canMapHostMemory: TRUE\n");
|
||
|
|
}else{
|
||
|
|
fprintf(fp," canMapHostMemory: FALSE\n");
|
||
|
|
}
|
||
|
|
if (deviceProp.deviceOverlap){
|
||
|
|
fprintf(fp," deviceOverlap: TRUE\n");
|
||
|
|
}else{
|
||
|
|
fprintf(fp," deviceOverlap: FALSE\n");
|
||
|
|
}
|
||
|
|
if (deviceProp.concurrentKernels){
|
||
|
|
fprintf(fp," concurrentKernels: TRUE\n");
|
||
|
|
}else{
|
||
|
|
fprintf(fp," concurrentKernels: FALSE\n");
|
||
|
|
}
|
||
|
|
// outputs initial memory infos via cudaMemGetInfo()
|
||
|
|
double free_db,used_db,total_db;
|
||
|
|
get_free_memory(&free_db,&used_db,&total_db);
|
||
|
|
fprintf(fp,"memory usage:\n");
|
||
|
|
fprintf(fp," rank %d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank,
|
||
|
|
used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
|
||
|
|
|
||
|
|
// closes output file
|
||
|
|
fclose(fp);
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
void finalize_cuda(){
|
||
|
|
cudaDeviceReset();
|
||
|
|
}
|
||
|
|
|
||
|
|
#endif // CUDA_INITIALIZE_H
|