From e7582e50cba3e81fd1c2d229df51ec30497ae81a Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 1 Aug 2025 09:28:06 +0800 Subject: [PATCH] tmp --- src/CMakeLists.txt | 4 +- src/sample/sample8.cu | 398 ++++++++++++++++++------------------------ 2 files changed, 175 insertions(+), 227 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a4c5524..5e9aec2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -40,7 +40,7 @@ set_target_properties(lcg PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJ set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) # 设置编译选项 -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") if(LibLCG_EIGEN) @@ -170,7 +170,7 @@ endif() if(LibLCG_CUDA) # The followings are not working for now due to CUDA 12+ compatibility issues. Check more later - #add_sample(lcg_sample8 sample8.cu) + add_sample(lcg_sample8 sample8.cu) #add_sample(lcg_sample9 sample9.cu) #add_sample(lcg_sample10 sample10.cu) #add_sample(lcg_sample11 sample11.cu) diff --git a/src/sample/sample8.cu b/src/sample/sample8.cu index 651aa12..90a6f8d 100644 --- a/src/sample/sample8.cu +++ b/src/sample/sample8.cu @@ -24,289 +24,237 @@ #include #include #include +#include +#include #include "../lib/lcg_cuda.h" +// ---------- 工具函数 ---------- void read(std::string filePath, int *pN, int *pnz, double **cooVal, - int **cooRowIdx, int **cooColIdx, double **b) + int **cooRowIdx, int **cooColIdx, double **b) { - std::ifstream in(filePath, std::ios::binary); + std::ifstream in(filePath, std::ios::binary); + in.read((char*)pN, sizeof(int)); + in.read((char*)pnz, sizeof(int)); - in.read((char*)pN, sizeof(int)); - in.read((char*)pnz, sizeof(int)); + *cooVal = new double[*pnz]{}; + *cooRowIdx = new int[*pnz]{}; + *cooColIdx = new int[*pnz]{}; + *b = new double[*pN]{}; - *cooVal = new double[*pnz]{}; - *cooRowIdx = new int[*pnz]{}; - *cooColIdx = new int[*pnz]{}; - *b = new double[*pN]{}; - - for (int i = 0; i < *pnz; ++i) - { - in.read((char*)&(*cooRowIdx)[i], sizeof(int)); - in.read((char*)&(*cooColIdx)[i], sizeof(int)); - in.read((char*)&(*cooVal)[i], sizeof(double)); - } - - in.read((char*)(*b), sizeof(double)*(*pN)); - return; + for (int i = 0; i < *pnz; ++i) + { + in.read((char*)&(*cooRowIdx)[i], sizeof(int)); + in.read((char*)&(*cooColIdx)[i], sizeof(int)); + in.read((char*)&(*cooVal)[i], sizeof(double)); + } + in.read((char*)(*b), sizeof(double)*(*pN)); } void readAnswer(std::string filePath, int *pN, double **x) { - std::ifstream in(filePath, std::ios::binary); - - in.read((char*)pN, sizeof(int)); - - *x = new double[*pN]{}; - - in.read((char*)(*x), sizeof(double)*(*pN)); - return; + std::ifstream in(filePath, std::ios::binary); + in.read((char*)pN, sizeof(int)); + *x = new double[*pN]{}; + in.read((char*)(*x), sizeof(double)*(*pN)); } lcg_float avg_error(lcg_float *a, lcg_float *b, int n) { - lcg_float avg = 0.0; - for (size_t i = 0; i < n; i++) - { - avg += (a[i] - b[i])*(a[i] - b[i]); - } - return sqrt(avg)/n; + lcg_float avg = 0.0; + for (int i = 0; i < n; ++i) + avg += (a[i] - b[i])*(a[i] - b[i]); + return sqrt(avg)/n; } -// Declare as global variables -lcg_float one = 1.0; +// ---------- 全局变量 ---------- +lcg_float one = 1.0; lcg_float zero = 0.0; -void *d_buf; -cusparseSpMatDescr_t smat_A; +double *d_A, *d_b, *d_ic, *d_pd; +int *d_rowPtrA, *d_colIdxA; -int *d_rowIdxA; // COO -int *d_rowPtrA; // CSR -int *d_colIdxA; -double *d_A; -double *d_pd; -double *d_ic; +cusparseSpMatDescr_t matA, matL; +cusparseDnVecDescr_t vecTmp; -cusparseMatDescr_t descr_A = 0; -cusparseMatDescr_t descr_L = 0; -csric02Info_t icinfo_A = 0; -csrsv2Info_t info_L = 0; -csrsv2Info_t info_LT = 0; +void *d_buf = nullptr; +size_t bufSize = 0; -void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, - cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size) +// ---------- SpMV ---------- +void cudaAx(void* instance, cublasHandle_t cub, cusparseHandle_t cus, + cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, + const int n, const int nz) { - // Calculate the product of A*x - cusparseSpMV(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A, - x, &zero, prod_Ax, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, d_buf); - return; + cusparseSpMV(cus, CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, matA, x, &zero, prod_Ax, + CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf); } -void cudaMx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, - cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size) +// ---------- SpSV 预条件 ---------- +cusparseSpSVDescr_t spSvDescrL = nullptr; +cusparseSpSVDescr_t spSvDescrLT = nullptr; + +void cudaMx(void* instance, cublasHandle_t cub, cusparseHandle_t cus, + cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, + const int n, const int nz) { - void *d_x, *d_Ax; - cusparseDnVecGetValues(x, &d_x); - cusparseDnVecGetValues(prod_Ax, &d_Ax); + // 预条件器: L * tmp = x => tmp + cusparseSpSV_solve(cus, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, // const void* + matL, // cusparseSpMatDescr_t + x, // cusparseDnVecDescr_t + vecTmp, // cusparseDnVecDescr_t + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + spSvDescrL); // cusparseSpSVDescr_t - cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, (double*) d_x, (double*) d_pd, - CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf); + // 预条件器: L^T * prod_Ax = tmp => prod_Ax + cusparseSpSV_solve(cus, + CUSPARSE_OPERATION_TRANSPOSE, + &one, + matL, + vecTmp, + prod_Ax, + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + spSvDescrLT); - cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, - n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, (double*) d_pd, (double*) d_Ax, - CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf); - return; } -int cudaProgress(void* instance, const lcg_float* m, const lcg_float converge, - const lcg_para* param, const int n_size, const int nz_size, const int k) +// ---------- 回调 ---------- +int cudaProgress(void* instance, const lcg_float* m, const lcg_float converge, + const lcg_para* param, const int n, const int nz, const int k) { - if (converge <= param->epsilon) { - std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl; - } - return 0; + if (converge <= param->epsilon) + std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl; + return 0; } +// ---------- 主函数 ---------- int main(int argc, char **argv) { - std::string inputPath = "data/case_10K_A"; - std::string answerPath = "data/case_10K_B"; + std::string inputPath = "data/case_10K_A"; + std::string answerPath = "data/case_10K_B"; - int N; - int nz; - double *A; - int *rowIdxA; - int *colIdxA; - double *b; - read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b); + int N, nz; + double *A, *b, *ans_x; + int *rowIdxA, *colIdxA; + read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b); + readAnswer(answerPath, &N, &ans_x); - double *ans_x; - readAnswer(answerPath, &N, &ans_x); + std::clog << "N = " << N << "\nnz = " << nz << std::endl; - std::clog << "N = " << N << std::endl; - std::clog << "nz = " << nz << std::endl; - - // Create handles - cublasHandle_t cubHandle; - cusparseHandle_t cusHandle; + // ---------- 初始化 ---------- + cublasHandle_t cubH; + cusparseHandle_t cusH; + cublasCreate(&cubH); + cusparseCreate(&cusH); - cublasCreate(&cubHandle); - cusparseCreate(&cusHandle); + // ---------- 设备内存 ---------- + cudaMalloc(&d_A, nz*sizeof(double)); + cudaMalloc(&d_rowPtrA, (N+1)*sizeof(int)); + cudaMalloc(&d_colIdxA, nz*sizeof(int)); + cudaMalloc(&d_b, N*sizeof(double)); + cudaMalloc(&d_pd, N*sizeof(double)); - // Allocate GPU memory & copy matrix/vector to device - cudaMalloc(&d_A, nz * sizeof(double)); - cudaMalloc(&d_rowIdxA, nz * sizeof(int)); - cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int)); - cudaMalloc(&d_colIdxA, nz * sizeof(int)); - cudaMalloc(&d_pd, N * sizeof(double)); + // COO -> CSR + thrust::device_vector d_rowIdx(rowIdxA, rowIdxA+nz); + cusparseXcoo2csr(cusH, thrust::raw_pointer_cast(d_rowIdx.data()), + nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO); - cudaMemcpy(d_A, A, nz * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_A, A, nz*sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_colIdxA, colIdxA, nz*sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_b, b, N*sizeof(double), cudaMemcpyHostToDevice); - // Convert matrix A from COO format to CSR format - cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO); + // ---------- 稀疏矩阵描述 ---------- + cusparseCreateCsr(&matA, N, N, nz, + d_rowPtrA, d_colIdxA, d_A, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); - // Create sparse matrix - cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); + // ---------- 预条件:IC0 ---------- + cudaMalloc(&d_ic, nz*sizeof(double)); + cudaMemcpy(d_ic, d_A, nz*sizeof(double), cudaMemcpyDeviceToDevice); - // This is just used to get bufferSize; - cusparseDnVecDescr_t dvec_tmp; - cusparseCreateDnVec(&dvec_tmp, N, d_pd, CUDA_R_64F); + cusparseCreateCsr(&matL, N, N, nz, + d_rowPtrA, d_colIdxA, d_ic, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); - size_t bufferSize_B; - cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A, - dvec_tmp, &zero, dvec_tmp, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize_B); + cusparseFillMode_t fill = CUSPARSE_FILL_MODE_LOWER; + cusparseDiagType_t diag = CUSPARSE_DIAG_TYPE_NON_UNIT; + cusparseSpMatSetAttribute(matL, CUSPARSE_SPMAT_FILL_MODE, &fill, sizeof(fill)); + cusparseSpMatSetAttribute(matL, CUSPARSE_SPMAT_DIAG_TYPE, &diag, sizeof(diag)); - // --- Start of the preconditioning part --- + // IC0 分析 + cusparseCreateDnVec(&vecTmp, N, d_pd, CUDA_R_64F); + cusparseSpSV_createDescr(&spSvDescrL); + cusparseSpSV_createDescr(&spSvDescrLT); - // Copy A - cudaMalloc(&d_ic, nz * sizeof(lcg_float)); - cudaMemcpy(d_ic, d_A, nz * sizeof(lcg_float), cudaMemcpyDeviceToDevice); + size_t buf1, buf2; + cusparseSpSV_bufferSize(cusH, CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, matL, vecTmp, vecTmp, CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrL, &buf1); + cusparseSpSV_bufferSize(cusH, CUSPARSE_OPERATION_TRANSPOSE, + &one, matL, vecTmp, vecTmp, CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrLT, &buf2); + bufSize = std::max({buf1, buf2}); + cudaMalloc(&d_buf, bufSize); - int bufferSize, bufferSize_A, bufferSize_L, bufferSize_LT; - bufferSize = bufferSize_B; + cusparseSpSV_analysis(cusH, CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, matL, vecTmp, vecTmp, CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrL, d_buf); + cusparseSpSV_analysis(cusH, CUSPARSE_OPERATION_TRANSPOSE, + &one, matL, vecTmp, vecTmp, CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrLT, d_buf); - // create descriptor for matrix A - cusparseCreateMatDescr(&descr_A); + // ---------- 求解 ---------- + lcg_para para = lcg_default_parameters(); + para.epsilon = 1e-6; + para.abs_diff = 0; - // initialize properties of matrix A - cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatFillMode(descr_A, CUSPARSE_FILL_MODE_LOWER); - cusparseSetMatDiagType(descr_A, CUSPARSE_DIAG_TYPE_NON_UNIT); - cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO); + std::vector host_x(N); - // create descriptor for matrix L - cusparseCreateMatDescr(&descr_L); - - // initialize properties of matrix L - cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER); - cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT); - cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO); - - // Create empty info objects for incomplete-cholesky factorization - cusparseCreateCsric02Info(&icinfo_A); - cusparseCreateCsrsv2Info(&info_L); - cusparseCreateCsrsv2Info(&info_LT); - - // Compute buffer size in computing ic factorization - cusparseDcsric02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA, - d_colIdxA, icinfo_A, &bufferSize_A); - cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, &bufferSize_L); - cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, - N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, &bufferSize_LT); - - bufferSize = max(max(max(bufferSize, bufferSize_A), bufferSize_L), bufferSize_LT); - cudaMalloc(&d_buf, bufferSize); - - // Perform incomplete-choleskey factorization: analysis phase - cusparseDcsric02_analysis(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, - d_colIdxA, icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf); - cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf); - cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, - N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf); - - // Perform incomplete-choleskey factorization: solve phase - cusparseDcsric02(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, d_colIdxA, - icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf); - - // --- End of the preconditioning part --- - - // Declare an initial solution - lcg_para self_para = lcg_default_parameters(); - self_para.epsilon = 1e-6; - self_para.abs_diff = 0; - - int ret; - double *host_m = new double[N]; - - // Solve with CG - for (size_t i = 0; i < N; i++) - { - host_m[i] = 0.0; - } - - ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CG); + // CG + std::fill(host_x.begin(), host_x.end(), 0.0); + int ret = lcg_solver_cuda(cudaAx, cudaProgress, + host_x.data(), b, N, nz, + ¶, nullptr, cubH, cusH, LCG_CG); lcg_error_str(ret); + std::clog << "CG avg error: " << avg_error(host_x.data(), ans_x, N) << std::endl; - std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl; - - // Solve with CGS - for (size_t i = 0; i < N; i++) - { - host_m[i] = 0.0; - } - - ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CGS); + // CGS + std::fill(host_x.begin(), host_x.end(), 0.0); + ret = lcg_solver_cuda(cudaAx, cudaProgress, + host_x.data(), b, N, nz, + ¶, nullptr, cubH, cusH, LCG_CGS); lcg_error_str(ret); + std::clog << "CGS avg error: " << avg_error(host_x.data(), ans_x, N) << std::endl; - std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl; - - // Solve with PCG - for (size_t i = 0; i < N; i++) - { - host_m[i] = 0.0; - } - - ret = lcg_solver_preconditioned_cuda(cudaAx, cudaMx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_PCG); + // PCG + /* + std::fill(host_x.begin(), host_x.end(), 0.0); + ret = lcg_solver_preconditioned_cuda(cudaAx, cudaMx, cudaProgress, + host_x.data(), b, N, nz, + ¶, nullptr, cubH, cusH, LCG_PCG); lcg_error_str(ret); + std::clog << "PCG avg error: " << avg_error(host_x.data(), ans_x, N) << std::endl; + */ - std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl; + // ---------- 清理 ---------- + delete[] A; delete[] rowIdxA; delete[] colIdxA; delete[] b; delete[] ans_x; - // Free Host memory - delete[] A; - delete[] rowIdxA; - delete[] colIdxA; - delete[] b; - delete[] ans_x; - delete[] host_m; + cudaFree(d_A); cudaFree(d_rowPtrA); cudaFree(d_colIdxA); + cudaFree(d_b); cudaFree(d_ic); cudaFree(d_pd); cudaFree(d_buf); - // Free Device memory - cudaFree(d_A); - cudaFree(d_rowIdxA); - cudaFree(d_rowPtrA); - cudaFree(d_colIdxA); - cudaFree(d_pd); - cudaFree(d_ic); + cusparseDestroySpMat(matA); + cusparseDestroySpMat(matL); + cusparseDestroyDnVec(vecTmp); + cusparseSpSV_destroyDescr(spSvDescrL); + cusparseSpSV_destroyDescr(spSvDescrLT); - cusparseDestroyDnVec(dvec_tmp); - cusparseDestroySpMat(smat_A); - cudaFree(d_buf); - - cusparseDestroyMatDescr(descr_A); - cusparseDestroyMatDescr(descr_L); - cusparseDestroyCsric02Info(icinfo_A); - cusparseDestroyCsrsv2Info(info_L); - cusparseDestroyCsrsv2Info(info_LT); - - // Free handles - cublasDestroy(cubHandle); - cusparseDestroy(cusHandle); - - return 0; + cublasDestroy(cubH); + cusparseDestroy(cusH); + return 0; } \ No newline at end of file