张壹 2025-08-01 09:28:06 +08:00
parent 834df92696
commit e7582e50cb
2 changed files with 175 additions and 227 deletions

View File

@@ -40,7 +40,7 @@ set_target_properties(lcg PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJ
 set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
 #
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
 if(LibLCG_EIGEN)
@@ -170,7 +170,7 @@ endif()
 if(LibLCG_CUDA)
 # The followings are not working for now due to CUDA 12+ compatibility issues. Check more later
-#add_sample(lcg_sample8 sample8.cu)
+add_sample(lcg_sample8 sample8.cu)
 #add_sample(lcg_sample9 sample9.cu)
 #add_sample(lcg_sample10 sample10.cu)
 #add_sample(lcg_sample11 sample11.cu)

View File

@@ -24,289 +24,237 @@
 #include <iomanip>
 #include <fstream>
 #include <cmath>
+#include <vector>
+#include <thrust/device_vector.h>
 #include "../lib/lcg_cuda.h"

+// ---------- Utility functions ----------
 void read(std::string filePath, int *pN, int *pnz, double **cooVal,
     int **cooRowIdx, int **cooColIdx, double **b)
 {
     std::ifstream in(filePath, std::ios::binary);
-    in.read((char*)pN, sizeof(int));
-    in.read((char*)pnz, sizeof(int));
-    *cooVal = new double[*pnz]{};
-    *cooRowIdx = new int[*pnz]{};
-    *cooColIdx = new int[*pnz]{};
-    *b = new double[*pN]{};
-    for (int i = 0; i < *pnz; ++i)
-    {
-        in.read((char*)&(*cooRowIdx)[i], sizeof(int));
-        in.read((char*)&(*cooColIdx)[i], sizeof(int));
-        in.read((char*)&(*cooVal)[i], sizeof(double));
-    }
-    in.read((char*)(*b), sizeof(double)*(*pN));
-    return;
+    in.read((char*)pN, sizeof(int));
+    in.read((char*)pnz, sizeof(int));
+    *cooVal = new double[*pnz]{};
+    *cooRowIdx = new int[*pnz]{};
+    *cooColIdx = new int[*pnz]{};
+    *b = new double[*pN]{};
+    for (int i = 0; i < *pnz; ++i)
+    {
+        in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+        in.read((char*)&(*cooColIdx)[i], sizeof(int));
+        in.read((char*)&(*cooVal)[i], sizeof(double));
+    }
+    in.read((char*)(*b), sizeof(double)*(*pN));
 }

 void readAnswer(std::string filePath, int *pN, double **x)
 {
     std::ifstream in(filePath, std::ios::binary);
-    in.read((char*)pN, sizeof(int));
-    *x = new double[*pN]{};
-    in.read((char*)(*x), sizeof(double)*(*pN));
-    return;
+    in.read((char*)pN, sizeof(int));
+    *x = new double[*pN]{};
+    in.read((char*)(*x), sizeof(double)*(*pN));
 }

 lcg_float avg_error(lcg_float *a, lcg_float *b, int n)
 {
     lcg_float avg = 0.0;
-    for (size_t i = 0; i < n; i++)
-    {
-        avg += (a[i] - b[i])*(a[i] - b[i]);
-    }
-    return sqrt(avg)/n;
+    for (int i = 0; i < n; ++i)
+        avg += (a[i] - b[i])*(a[i] - b[i]);
+    return sqrt(avg)/n;
 }

-// Declare as global variables
+// ---------- Global variables ----------
 lcg_float one = 1.0;
 lcg_float zero = 0.0;
-void *d_buf;
-cusparseSpMatDescr_t smat_A;
-int *d_rowIdxA; // COO
-int *d_rowPtrA; // CSR
-int *d_colIdxA;
-double *d_A;
-double *d_pd;
-double *d_ic;
-cusparseMatDescr_t descr_A = 0;
-cusparseMatDescr_t descr_L = 0;
-csric02Info_t icinfo_A = 0;
-csrsv2Info_t info_L = 0;
-csrsv2Info_t info_LT = 0;
+double *d_A, *d_b, *d_ic, *d_pd;
+int *d_rowPtrA, *d_colIdxA;
+cusparseSpMatDescr_t matA, matL;
+cusparseDnVecDescr_t vecTmp;
+void *d_buf = nullptr;
+size_t bufSize = 0;

-void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle,
-    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
+// ---------- SpMV ----------
+void cudaAx(void* instance, cublasHandle_t cub, cusparseHandle_t cus,
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax,
+    const int n, const int nz)
 {
-    // Calculate the product of A*x
-    cusparseSpMV(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
-        x, &zero, prod_Ax, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, d_buf);
-    return;
+    cusparseSpMV(cus, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &one, matA, x, &zero, prod_Ax,
+        CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
 }

-void cudaMx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle,
-    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
+// ---------- SpSV preconditioner ----------
+cusparseSpSVDescr_t spSvDescrL = nullptr;
+cusparseSpSVDescr_t spSvDescrLT = nullptr;
+void cudaMx(void* instance, cublasHandle_t cub, cusparseHandle_t cus,
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax,
+    const int n, const int nz)
 {
-    void *d_x, *d_Ax;
-    cusparseDnVecGetValues(x, &d_x);
-    cusparseDnVecGetValues(prod_Ax, &d_Ax);
-    cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-        n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, (double*) d_x, (double*) d_pd,
-        CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-    cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE,
-        n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, (double*) d_pd, (double*) d_Ax,
-        CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-    return;
+    // Preconditioner: L * tmp = x => tmp
+    cusparseSpSV_solve(cus,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &one,        // const void*
+        matL,        // cusparseSpMatDescr_t
+        x,           // cusparseDnVecDescr_t
+        vecTmp,      // cusparseDnVecDescr_t
+        CUDA_R_64F,
+        CUSPARSE_SPSV_ALG_DEFAULT,
+        spSvDescrL); // cusparseSpSVDescr_t
+    // Preconditioner: L^T * prod_Ax = tmp => prod_Ax
+    cusparseSpSV_solve(cus,
+        CUSPARSE_OPERATION_TRANSPOSE,
+        &one,
+        matL,
+        vecTmp,
+        prod_Ax,
+        CUDA_R_64F,
+        CUSPARSE_SPSV_ALG_DEFAULT,
+        spSvDescrLT);
 }

-int cudaProgress(void* instance, const lcg_float* m, const lcg_float converge,
-    const lcg_para* param, const int n_size, const int nz_size, const int k)
+// ---------- Progress callback ----------
+int cudaProgress(void* instance, const lcg_float* m, const lcg_float converge,
+    const lcg_para* param, const int n, const int nz, const int k)
 {
-    if (converge <= param->epsilon) {
-        std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
-    }
-    return 0;
+    if (converge <= param->epsilon)
+        std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
+    return 0;
 }

+// ---------- Main ----------
 int main(int argc, char **argv)
 {
     std::string inputPath = "data/case_10K_A";
     std::string answerPath = "data/case_10K_B";
-    int N;
-    int nz;
-    double *A;
-    int *rowIdxA;
-    int *colIdxA;
-    double *b;
-    read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
-    double *ans_x;
-    readAnswer(answerPath, &N, &ans_x);
-    std::clog << "N = " << N << std::endl;
-    std::clog << "nz = " << nz << std::endl;
-    // Create handles
-    cublasHandle_t cubHandle;
-    cusparseHandle_t cusHandle;
-    cublasCreate(&cubHandle);
-    cusparseCreate(&cusHandle);
-    // Allocate GPU memory & copy matrix/vector to device
-    cudaMalloc(&d_A, nz * sizeof(double));
-    cudaMalloc(&d_rowIdxA, nz * sizeof(int));
-    cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
-    cudaMalloc(&d_colIdxA, nz * sizeof(int));
-    cudaMalloc(&d_pd, N * sizeof(double));
-    cudaMemcpy(d_A, A, nz * sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
-    // Convert matrix A from COO format to CSR format
-    cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
-    // Create sparse matrix
-    cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
-        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
-    // This is just used to get bufferSize;
-    cusparseDnVecDescr_t dvec_tmp;
-    cusparseCreateDnVec(&dvec_tmp, N, d_pd, CUDA_R_64F);
-    size_t bufferSize_B;
-    cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
-        dvec_tmp, &zero, dvec_tmp, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize_B);
-    // --- Start of the preconditioning part ---
-    // Copy A
-    cudaMalloc(&d_ic, nz * sizeof(lcg_float));
-    cudaMemcpy(d_ic, d_A, nz * sizeof(lcg_float), cudaMemcpyDeviceToDevice);
-    int bufferSize, bufferSize_A, bufferSize_L, bufferSize_LT;
-    bufferSize = bufferSize_B;
-    // create descriptor for matrix A
-    cusparseCreateMatDescr(&descr_A);
-    // initialize properties of matrix A
-    cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_GENERAL);
-    cusparseSetMatFillMode(descr_A, CUSPARSE_FILL_MODE_LOWER);
-    cusparseSetMatDiagType(descr_A, CUSPARSE_DIAG_TYPE_NON_UNIT);
-    cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO);
-    // create descriptor for matrix L
-    cusparseCreateMatDescr(&descr_L);
-    // initialize properties of matrix L
-    cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
-    cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
-    cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT);
-    cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
-    // Create empty info objects for incomplete-cholesky factorization
-    cusparseCreateCsric02Info(&icinfo_A);
-    cusparseCreateCsrsv2Info(&info_L);
-    cusparseCreateCsrsv2Info(&info_LT);
-    // Compute buffer size in computing ic factorization
-    cusparseDcsric02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA,
-        d_colIdxA, icinfo_A, &bufferSize_A);
-    cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-        N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, &bufferSize_L);
-    cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_TRANSPOSE,
-        N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, &bufferSize_LT);
-    bufferSize = max(max(max(bufferSize, bufferSize_A), bufferSize_L), bufferSize_LT);
-    cudaMalloc(&d_buf, bufferSize);
-    // Perform incomplete-choleskey factorization: analysis phase
-    cusparseDcsric02_analysis(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA,
-        d_colIdxA, icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-    cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-        N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-    cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_TRANSPOSE,
-        N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-    // Perform incomplete-choleskey factorization: solve phase
-    cusparseDcsric02(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, d_colIdxA,
-        icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-    // --- End of the preconditioning part ---
-    // Declare an initial solution
-    lcg_para self_para = lcg_default_parameters();
-    self_para.epsilon = 1e-6;
-    self_para.abs_diff = 0;
-    int ret;
-    double *host_m = new double[N];
-    // Solve with CG
-    for (size_t i = 0; i < N; i++)
-    {
-        host_m[i] = 0.0;
-    }
-    ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CG);
+    int N, nz;
+    double *A, *b, *ans_x;
+    int *rowIdxA, *colIdxA;
+    read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+    readAnswer(answerPath, &N, &ans_x);
+    std::clog << "N = " << N << "\nnz = " << nz << std::endl;
+    // ---------- Initialization ----------
+    cublasHandle_t cubH;
+    cusparseHandle_t cusH;
+    cublasCreate(&cubH);
+    cusparseCreate(&cusH);
+    // ---------- Device memory ----------
+    cudaMalloc(&d_A, nz*sizeof(double));
+    cudaMalloc(&d_rowPtrA, (N+1)*sizeof(int));
+    cudaMalloc(&d_colIdxA, nz*sizeof(int));
+    cudaMalloc(&d_b, N*sizeof(double));
+    cudaMalloc(&d_pd, N*sizeof(double));
+    // COO -> CSR
+    thrust::device_vector<int> d_rowIdx(rowIdxA, rowIdxA+nz);
+    cusparseXcoo2csr(cusH, thrust::raw_pointer_cast(d_rowIdx.data()),
+        nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
+    cudaMemcpy(d_A, A, nz*sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_colIdxA, colIdxA, nz*sizeof(int), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_b, b, N*sizeof(double), cudaMemcpyHostToDevice);
+    // ---------- Sparse matrix descriptors ----------
+    cusparseCreateCsr(&matA, N, N, nz,
+        d_rowPtrA, d_colIdxA, d_A,
+        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+    // ---------- IC(0) preconditioner ----------
+    cudaMalloc(&d_ic, nz*sizeof(double));
+    cudaMemcpy(d_ic, d_A, nz*sizeof(double), cudaMemcpyDeviceToDevice);
+    cusparseCreateCsr(&matL, N, N, nz,
+        d_rowPtrA, d_colIdxA, d_ic,
+        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+    cusparseFillMode_t fill = CUSPARSE_FILL_MODE_LOWER;
+    cusparseDiagType_t diag = CUSPARSE_DIAG_TYPE_NON_UNIT;
+    cusparseSpMatSetAttribute(matL, CUSPARSE_SPMAT_FILL_MODE, &fill, sizeof(fill));
+    cusparseSpMatSetAttribute(matL, CUSPARSE_SPMAT_DIAG_TYPE, &diag, sizeof(diag));
+    // IC(0) analysis
+    cusparseCreateDnVec(&vecTmp, N, d_pd, CUDA_R_64F);
+    cusparseSpSV_createDescr(&spSvDescrL);
+    cusparseSpSV_createDescr(&spSvDescrLT);
+    size_t buf1, buf2;
+    cusparseSpSV_bufferSize(cusH, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &one, matL, vecTmp, vecTmp, CUDA_R_64F,
+        CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrL, &buf1);
+    cusparseSpSV_bufferSize(cusH, CUSPARSE_OPERATION_TRANSPOSE,
+        &one, matL, vecTmp, vecTmp, CUDA_R_64F,
+        CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrLT, &buf2);
+    bufSize = std::max({buf1, buf2});
+    cudaMalloc(&d_buf, bufSize);
+    cusparseSpSV_analysis(cusH, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &one, matL, vecTmp, vecTmp, CUDA_R_64F,
+        CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrL, d_buf);
+    cusparseSpSV_analysis(cusH, CUSPARSE_OPERATION_TRANSPOSE,
+        &one, matL, vecTmp, vecTmp, CUDA_R_64F,
+        CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrLT, d_buf);
+    // ---------- Solve ----------
+    lcg_para para = lcg_default_parameters();
+    para.epsilon = 1e-6;
+    para.abs_diff = 0;
+    std::vector<double> host_x(N);
+    // CG
+    std::fill(host_x.begin(), host_x.end(), 0.0);
+    int ret = lcg_solver_cuda(cudaAx, cudaProgress,
+        host_x.data(), b, N, nz,
+        &para, nullptr, cubH, cusH, LCG_CG);
     lcg_error_str(ret);
-    std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
-    // Solve with CGS
-    for (size_t i = 0; i < N; i++)
-    {
-        host_m[i] = 0.0;
-    }
-    ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CGS);
+    std::clog << "CG avg error: " << avg_error(host_x.data(), ans_x, N) << std::endl;
+    // CGS
+    std::fill(host_x.begin(), host_x.end(), 0.0);
+    ret = lcg_solver_cuda(cudaAx, cudaProgress,
+        host_x.data(), b, N, nz,
+        &para, nullptr, cubH, cusH, LCG_CGS);
     lcg_error_str(ret);
-    std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
-    /*
-    // Solve with PCG
-    for (size_t i = 0; i < N; i++)
-    {
-        host_m[i] = 0.0;
-    }
-    ret = lcg_solver_preconditioned_cuda(cudaAx, cudaMx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_PCG);
+    std::clog << "CGS avg error: " << avg_error(host_x.data(), ans_x, N) << std::endl;
+    // PCG
+    std::fill(host_x.begin(), host_x.end(), 0.0);
+    ret = lcg_solver_preconditioned_cuda(cudaAx, cudaMx, cudaProgress,
+        host_x.data(), b, N, nz,
+        &para, nullptr, cubH, cusH, LCG_PCG);
     lcg_error_str(ret);
-    */
-    std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
-    // Free Host memory
-    delete[] A;
-    delete[] rowIdxA;
-    delete[] colIdxA;
-    delete[] b;
-    delete[] ans_x;
-    delete[] host_m;
-    // Free Device memory
-    cudaFree(d_A);
-    cudaFree(d_rowIdxA);
-    cudaFree(d_rowPtrA);
-    cudaFree(d_colIdxA);
-    cudaFree(d_pd);
-    cudaFree(d_ic);
-    cusparseDestroyDnVec(dvec_tmp);
-    cusparseDestroySpMat(smat_A);
-    cudaFree(d_buf);
-    cusparseDestroyMatDescr(descr_A);
-    cusparseDestroyMatDescr(descr_L);
-    cusparseDestroyCsric02Info(icinfo_A);
-    cusparseDestroyCsrsv2Info(info_L);
-    cusparseDestroyCsrsv2Info(info_LT);
-    // Free handles
-    cublasDestroy(cubHandle);
-    cusparseDestroy(cusHandle);
-    return 0;
+    std::clog << "PCG avg error: " << avg_error(host_x.data(), ans_x, N) << std::endl;
+    // ---------- Cleanup ----------
+    delete[] A; delete[] rowIdxA; delete[] colIdxA; delete[] b; delete[] ans_x;
+    cudaFree(d_A); cudaFree(d_rowPtrA); cudaFree(d_colIdxA);
+    cudaFree(d_b); cudaFree(d_ic); cudaFree(d_pd); cudaFree(d_buf);
+    cusparseDestroySpMat(matA);
+    cusparseDestroySpMat(matL);
+    cusparseDestroyDnVec(vecTmp);
+    cusparseSpSV_destroyDescr(spSvDescrL);
+    cusparseSpSV_destroyDescr(spSvDescrLT);
+    cublasDestroy(cubH);
+    cusparseDestroy(cusH);
+    return 0;
 }
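
For reference, the rewritten sample drives its triangular-solve preconditioner through the generic cusparseSpSV API, which replaces the csrsv2 routines removed in CUDA 12: each triangular operation goes through the same three-step sequence of buffer-size query, one-time analysis, and repeated solves. The condensed sketch below shows that sequence for a single lower-triangular solve; it is an illustration only, and the CHECK_CUSPARSE macro and spsv_lower_solve wrapper are assumed names that do not appear in the commit.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cusparse.h>

// Illustrative status check; the sample itself does not inspect return codes.
#define CHECK_CUSPARSE(call)                                            \
    do {                                                                \
        cusparseStatus_t s_ = (call);                                   \
        if (s_ != CUSPARSE_STATUS_SUCCESS) {                            \
            std::fprintf(stderr, "cuSPARSE error %d (line %d)\n",       \
                         (int)s_, __LINE__);                            \
            std::exit(EXIT_FAILURE);                                    \
        }                                                               \
    } while (0)

// Solve L * y = x once, where matL is a CSR matrix already tagged as
// lower triangular with a non-unit diagonal via cusparseSpMatSetAttribute().
void spsv_lower_solve(cusparseHandle_t handle, cusparseSpMatDescr_t matL,
                      cusparseDnVecDescr_t x, cusparseDnVecDescr_t y)
{
    const double alpha = 1.0;
    cusparseSpSVDescr_t descr;
    void *buf = nullptr;
    size_t bufSize = 0;

    CHECK_CUSPARSE(cusparseSpSV_createDescr(&descr));

    // Step 1: query the workspace size for this operation.
    CHECK_CUSPARSE(cusparseSpSV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                           &alpha, matL, x, y, CUDA_R_64F,
                                           CUSPARSE_SPSV_ALG_DEFAULT, descr, &bufSize));
    cudaMalloc(&buf, bufSize);

    // Step 2: one-time analysis; its results are cached in descr and buf.
    CHECK_CUSPARSE(cusparseSpSV_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                         &alpha, matL, x, y, CUDA_R_64F,
                                         CUSPARSE_SPSV_ALG_DEFAULT, descr, buf));

    // Step 3: the actual triangular solve; this call may be repeated with new
    // vectors as long as descr and buf remain alive.
    CHECK_CUSPARSE(cusparseSpSV_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                      &alpha, matL, x, y, CUDA_R_64F,
                                      CUSPARSE_SPSV_ALG_DEFAULT, descr));

    cudaFree(buf);
    CHECK_CUSPARSE(cusparseSpSV_destroyDescr(descr));
}

The sample applies the same split: the buffer-size and analysis calls run once in main(), so each call to cudaMx() inside the solver only pays for the two cusparseSpSV_solve calls.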