tmp

2025-08-01 09:28:06 +08:00
parent 834df92696
commit e7582e50cb
2 changed files with 175 additions and 227 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -40,7 +40,7 @@ set_target_properties(lcg PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJ
 set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)

 # 设置编译选项
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")

 if(LibLCG_EIGEN)
@@ -170,7 +170,7 @@ endif()

 if(LibLCG_CUDA)
 	# The followings are not working for now due to CUDA 12+ compatibility issues. Check more later
-	#add_sample(lcg_sample8 sample8.cu)
+	add_sample(lcg_sample8 sample8.cu)
 	#add_sample(lcg_sample9 sample9.cu)
 	#add_sample(lcg_sample10 sample10.cu)
 	#add_sample(lcg_sample11 sample11.cu)
--- a/src/sample/sample8.cu
+++ b/src/sample/sample8.cu
@@ -24,289 +24,237 @@
 #include <iomanip>
 #include <fstream>
 #include <cmath>
+#include <vector>
+#include <thrust/device_vector.h>

 #include "../lib/lcg_cuda.h"

+// ---------- 工具函数 ----------
 void read(std::string filePath, int *pN, int *pnz, double **cooVal,
-	int **cooRowIdx, int **cooColIdx, double **b)
+          int **cooRowIdx, int **cooColIdx, double **b)
 {
-	std::ifstream in(filePath, std::ios::binary);
+    std::ifstream in(filePath, std::ios::binary);
+    in.read((char*)pN, sizeof(int));
+    in.read((char*)pnz, sizeof(int));

-	in.read((char*)pN, sizeof(int));
-	in.read((char*)pnz, sizeof(int));
+    *cooVal     = new double[*pnz]{};
+    *cooRowIdx  = new int[*pnz]{};
+    *cooColIdx  = new int[*pnz]{};
+    *b          = new double[*pN]{};

-	*cooVal = new double[*pnz]{};
-	*cooRowIdx = new int[*pnz]{};
-	*cooColIdx = new int[*pnz]{};
-	*b = new double[*pN]{};
-
-	for (int i = 0; i < *pnz; ++i)
-	{
-		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
-		in.read((char*)&(*cooColIdx)[i], sizeof(int));
-		in.read((char*)&(*cooVal)[i], sizeof(double));
-	}
-
-	in.read((char*)(*b), sizeof(double)*(*pN));
-    return;
+    for (int i = 0; i < *pnz; ++i)
+    {
+        in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+        in.read((char*)&(*cooColIdx)[i], sizeof(int));
+        in.read((char*)&(*cooVal)[i], sizeof(double));
+    }
+    in.read((char*)(*b), sizeof(double)*(*pN));
 }

 void readAnswer(std::string filePath, int *pN, double **x)
 {
-	std::ifstream in(filePath, std::ios::binary);
-
-	in.read((char*)pN, sizeof(int));
-
-	*x = new double[*pN]{};
-
-	in.read((char*)(*x), sizeof(double)*(*pN));
-    return;
+    std::ifstream in(filePath, std::ios::binary);
+    in.read((char*)pN, sizeof(int));
+    *x = new double[*pN]{};
+    in.read((char*)(*x), sizeof(double)*(*pN));
 }

 lcg_float avg_error(lcg_float *a, lcg_float *b, int n)
 {
-	lcg_float avg = 0.0;
-	for (size_t i = 0; i < n; i++)
-	{
-		avg += (a[i] - b[i])*(a[i] - b[i]);
-	}
-	return sqrt(avg)/n;
+    lcg_float avg = 0.0;
+    for (int i = 0; i < n; ++i)
+        avg += (a[i] - b[i])*(a[i] - b[i]);
+    return sqrt(avg)/n;
 }

-// Declare as global variables
-lcg_float one = 1.0;
+// ---------- 全局变量 ----------
+lcg_float one  = 1.0;
 lcg_float zero = 0.0;

-void *d_buf;
-cusparseSpMatDescr_t smat_A;
+double  *d_A, *d_b, *d_ic, *d_pd;
+int     *d_rowPtrA, *d_colIdxA;

-int *d_rowIdxA; // COO
-int *d_rowPtrA; // CSR
-int *d_colIdxA;
-double *d_A;
-double *d_pd;
-double *d_ic;
+cusparseSpMatDescr_t  matA, matL;
+cusparseDnVecDescr_t  vecTmp;

-cusparseMatDescr_t descr_A = 0;
-cusparseMatDescr_t descr_L = 0;
-csric02Info_t icinfo_A = 0;
-csrsv2Info_t info_L = 0;
-csrsv2Info_t info_LT = 0;
+void *d_buf = nullptr;
+size_t bufSize = 0;

-void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
-    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
+// ---------- SpMV ----------
+void cudaAx(void* instance, cublasHandle_t cub, cusparseHandle_t cus,
+            cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax,
+            const int n, const int nz)
 {
-	// Calculate the product of A*x
-	cusparseSpMV(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
-		x, &zero, prod_Ax, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, d_buf);
-    return;
+    cusparseSpMV(cus, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                 &one, matA, x, &zero, prod_Ax,
+                 CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
 }

-void cudaMx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
-    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
+// ---------- SpSV 预条件 ----------
+cusparseSpSVDescr_t spSvDescrL = nullptr;
+cusparseSpSVDescr_t spSvDescrLT = nullptr;
+
+void cudaMx(void* instance, cublasHandle_t cub, cusparseHandle_t cus,
+            cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax,
+            const int n, const int nz)
 {
-	void *d_x, *d_Ax;
-	cusparseDnVecGetValues(x, &d_x);
-	cusparseDnVecGetValues(prod_Ax, &d_Ax);
+    // 预条件器： L * tmp = x  => tmp
+	cusparseSpSV_solve(cus,
+					CUSPARSE_OPERATION_NON_TRANSPOSE,
+					&one,                // const void*
+					matL,                // cusparseSpMatDescr_t
+					x,                   // cusparseDnVecDescr_t
+					vecTmp,              // cusparseDnVecDescr_t
+					CUDA_R_64F,
+					CUSPARSE_SPSV_ALG_DEFAULT,
+					spSvDescrL);         // cusparseSpSVDescr_t

-	cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
-		n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, (double*) d_x, (double*) d_pd, 
-		CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	// 预条件器： L^T * prod_Ax = tmp  => prod_Ax
+	cusparseSpSV_solve(cus,
+					CUSPARSE_OPERATION_TRANSPOSE,
+					&one,
+					matL,
+					vecTmp,
+					prod_Ax,
+					CUDA_R_64F,
+					CUSPARSE_SPSV_ALG_DEFAULT,
+					spSvDescrLT);

-	cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, 
-		n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, (double*) d_pd, (double*) d_Ax, 
-		CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-    return;
 }

+// ---------- 回调 ----------
 int cudaProgress(void* instance, const lcg_float* m, const lcg_float converge,
-	const lcg_para* param, const int n_size, const int nz_size, const int k)
+                 const lcg_para* param, const int n, const int nz, const int k)
 {
-    if (converge <= param->epsilon) {
-		std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
-	}
-	return 0;
+    if (converge <= param->epsilon)
+        std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
+    return 0;
 }

+// ---------- 主函数 ----------
 int main(int argc, char **argv)
 {
-	std::string inputPath = "data/case_10K_A";
-	std::string answerPath = "data/case_10K_B";
+    std::string inputPath   = "data/case_10K_A";
+    std::string answerPath  = "data/case_10K_B";

-	int N;
-	int nz;
-	double *A;
-	int *rowIdxA;
-	int *colIdxA;
-	double *b;
-	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+    int N, nz;
+    double *A, *b, *ans_x;
+    int *rowIdxA, *colIdxA;
+    read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+    readAnswer(answerPath, &N, &ans_x);

-	double *ans_x;
-	readAnswer(answerPath, &N, &ans_x);
+    std::clog << "N = " << N << "\nnz = " << nz << std::endl;

-	std::clog << "N = " << N << std::endl;
-	std::clog << "nz = " << nz << std::endl;
+    // ---------- 初始化 ----------
+    cublasHandle_t   cubH;
+    cusparseHandle_t cusH;
+    cublasCreate(&cubH);
+    cusparseCreate(&cusH);

-	// Create handles
-	cublasHandle_t cubHandle;
-	cusparseHandle_t cusHandle;
+    // ---------- 设备内存 ----------
+    cudaMalloc(&d_A,       nz*sizeof(double));
+    cudaMalloc(&d_rowPtrA, (N+1)*sizeof(int));
+    cudaMalloc(&d_colIdxA, nz*sizeof(int));
+    cudaMalloc(&d_b,       N*sizeof(double));
+    cudaMalloc(&d_pd,      N*sizeof(double));

-	cublasCreate(&cubHandle);
-	cusparseCreate(&cusHandle);
+    // COO -> CSR
+    thrust::device_vector<int> d_rowIdx(rowIdxA, rowIdxA+nz);
+    cusparseXcoo2csr(cusH, thrust::raw_pointer_cast(d_rowIdx.data()),
+                     nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);

-	// Allocate GPU memory & copy matrix/vector to device
-	cudaMalloc(&d_A, nz * sizeof(double));
-	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
-	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
-	cudaMalloc(&d_colIdxA, nz * sizeof(int));
-	cudaMalloc(&d_pd, N * sizeof(double));
+    cudaMemcpy(d_A,       A,       nz*sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_colIdxA, colIdxA, nz*sizeof(int),    cudaMemcpyHostToDevice);
+    cudaMemcpy(d_b,       b,       N*sizeof(double),  cudaMemcpyHostToDevice);

-	cudaMemcpy(d_A, A, nz * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
-	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+    // ---------- 稀疏矩阵描述 ----------
+    cusparseCreateCsr(&matA, N, N, nz,
+                      d_rowPtrA, d_colIdxA, d_A,
+                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);

-	// Convert matrix A from COO format to CSR format
-	cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
+    // ---------- 预条件：IC0 ----------
+    cudaMalloc(&d_ic, nz*sizeof(double));
+    cudaMemcpy(d_ic, d_A, nz*sizeof(double), cudaMemcpyDeviceToDevice);

-	// Create sparse matrix
-	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
-		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+    cusparseCreateCsr(&matL, N, N, nz,
+                      d_rowPtrA, d_colIdxA, d_ic,
+                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);

-	// This is just used to get bufferSize;
-	cusparseDnVecDescr_t dvec_tmp;
-	cusparseCreateDnVec(&dvec_tmp, N, d_pd, CUDA_R_64F);
+    cusparseFillMode_t  fill  = CUSPARSE_FILL_MODE_LOWER;
+    cusparseDiagType_t  diag  = CUSPARSE_DIAG_TYPE_NON_UNIT;
+    cusparseSpMatSetAttribute(matL, CUSPARSE_SPMAT_FILL_MODE, &fill, sizeof(fill));
+    cusparseSpMatSetAttribute(matL, CUSPARSE_SPMAT_DIAG_TYPE, &diag, sizeof(diag));

-	size_t bufferSize_B;
-	cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
-		dvec_tmp, &zero, dvec_tmp, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize_B);
+    // IC0 分析
+    cusparseCreateDnVec(&vecTmp, N, d_pd, CUDA_R_64F);
+    cusparseSpSV_createDescr(&spSvDescrL);
+    cusparseSpSV_createDescr(&spSvDescrLT);

-	// --- Start of the preconditioning part ---
+    size_t buf1, buf2;
+    cusparseSpSV_bufferSize(cusH, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            &one, matL, vecTmp, vecTmp, CUDA_R_64F,
+                            CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrL, &buf1);
+    cusparseSpSV_bufferSize(cusH, CUSPARSE_OPERATION_TRANSPOSE,
+                            &one, matL, vecTmp, vecTmp, CUDA_R_64F,
+                            CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrLT, &buf2);
+    bufSize = std::max({buf1, buf2});
+    cudaMalloc(&d_buf, bufSize);

-	// Copy A
-	cudaMalloc(&d_ic, nz * sizeof(lcg_float));
-	cudaMemcpy(d_ic, d_A, nz * sizeof(lcg_float), cudaMemcpyDeviceToDevice);
+    cusparseSpSV_analysis(cusH, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                          &one, matL, vecTmp, vecTmp, CUDA_R_64F,
+                          CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrL, d_buf);
+    cusparseSpSV_analysis(cusH, CUSPARSE_OPERATION_TRANSPOSE,
+                          &one, matL, vecTmp, vecTmp, CUDA_R_64F,
+                          CUSPARSE_SPSV_ALG_DEFAULT, spSvDescrLT, d_buf);

-	int bufferSize, bufferSize_A, bufferSize_L, bufferSize_LT;
-	bufferSize = bufferSize_B;
+    // ---------- 求解 ----------
+    lcg_para para = lcg_default_parameters();
+    para.epsilon = 1e-6;
+    para.abs_diff = 0;

-	// create descriptor for matrix A
-	cusparseCreateMatDescr(&descr_A);
+    std::vector<double> host_x(N);

-	// initialize properties of matrix A
-	cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_GENERAL);
-	cusparseSetMatFillMode(descr_A, CUSPARSE_FILL_MODE_LOWER);
-	cusparseSetMatDiagType(descr_A, CUSPARSE_DIAG_TYPE_NON_UNIT);
-	cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO);
-
-	// create descriptor for matrix L
-	cusparseCreateMatDescr(&descr_L);
-
-	// initialize properties of matrix L
-	cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
-	cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
-	cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT);
-	cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
-
-	// Create empty info objects for incomplete-cholesky factorization
-	cusparseCreateCsric02Info(&icinfo_A);
-	cusparseCreateCsrsv2Info(&info_L);
-	cusparseCreateCsrsv2Info(&info_LT);
-
-	// Compute buffer size in computing ic factorization
-	cusparseDcsric02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA, 
-		d_colIdxA, icinfo_A, &bufferSize_A);
-	cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
-		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, &bufferSize_L);
-	cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, 
-		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, &bufferSize_LT);
-	
-	bufferSize = max(max(max(bufferSize, bufferSize_A), bufferSize_L), bufferSize_LT);
-	cudaMalloc(&d_buf, bufferSize);
-
-	// Perform incomplete-choleskey factorization: analysis phase
-	cusparseDcsric02_analysis(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, 
-		d_colIdxA, icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-	cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
-		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-	cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, 
-		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-
-	// Perform incomplete-choleskey factorization: solve phase
-	cusparseDcsric02(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, d_colIdxA, 
-		icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
-
-	// --- End of the preconditioning part ---
-
-	// Declare an initial solution
-    lcg_para self_para = lcg_default_parameters();
-	self_para.epsilon = 1e-6;
-	self_para.abs_diff = 0;
-
-	int ret;
-	double *host_m = new double[N];
-
-	// Solve with CG
-	for (size_t i = 0; i < N; i++)
-	{
-		host_m[i] = 0.0;
-	}
-
-    ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CG);
+    // CG
+    std::fill(host_x.begin(), host_x.end(), 0.0);
+    int ret = lcg_solver_cuda(cudaAx, cudaProgress,
+                              host_x.data(), b, N, nz,
+                              &para, nullptr, cubH, cusH, LCG_CG);
    lcg_error_str(ret);
+    std::clog << "CG avg error: " << avg_error(host_x.data(), ans_x, N) << std::endl;

-	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
-
-	// Solve with CGS
-	for (size_t i = 0; i < N; i++)
-	{
-		host_m[i] = 0.0;
-	}
-
-	ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CGS);
+    // CGS
+    std::fill(host_x.begin(), host_x.end(), 0.0);
+    ret = lcg_solver_cuda(cudaAx, cudaProgress,
+                          host_x.data(), b, N, nz,
+                          &para, nullptr, cubH, cusH, LCG_CGS);
    lcg_error_str(ret);
+    std::clog << "CGS avg error: " << avg_error(host_x.data(), ans_x, N) << std::endl;

-	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
-
-	// Solve with PCG
-	for (size_t i = 0; i < N; i++)
-	{
-		host_m[i] = 0.0;
-	}
-
-	ret = lcg_solver_preconditioned_cuda(cudaAx, cudaMx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_PCG);
+    // PCG
+	/*
+    std::fill(host_x.begin(), host_x.end(), 0.0);
+    ret = lcg_solver_preconditioned_cuda(cudaAx, cudaMx, cudaProgress,
+                                         host_x.data(), b, N, nz,
+                                         &para, nullptr, cubH, cusH, LCG_PCG);
    lcg_error_str(ret);
+    std::clog << "PCG avg error: " << avg_error(host_x.data(), ans_x, N) << std::endl;
+	*/

-	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+    // ---------- 清理 ----------
+    delete[] A; delete[] rowIdxA; delete[] colIdxA; delete[] b; delete[] ans_x;

-	// Free Host memory
-	delete[] A;
-	delete[] rowIdxA;
-	delete[] colIdxA;
-	delete[] b;
-	delete[] ans_x;
-	delete[] host_m;
+    cudaFree(d_A); cudaFree(d_rowPtrA); cudaFree(d_colIdxA);
+    cudaFree(d_b); cudaFree(d_ic); cudaFree(d_pd); cudaFree(d_buf);

-	// Free Device memory
-	cudaFree(d_A);
-	cudaFree(d_rowIdxA);
-	cudaFree(d_rowPtrA);
-	cudaFree(d_colIdxA);
-	cudaFree(d_pd);
-	cudaFree(d_ic);
+    cusparseDestroySpMat(matA);
+    cusparseDestroySpMat(matL);
+    cusparseDestroyDnVec(vecTmp);
+    cusparseSpSV_destroyDescr(spSvDescrL);
+    cusparseSpSV_destroyDescr(spSvDescrLT);

-	cusparseDestroyDnVec(dvec_tmp);
-	cusparseDestroySpMat(smat_A);
-	cudaFree(d_buf);
-
-	cusparseDestroyMatDescr(descr_A);
-	cusparseDestroyMatDescr(descr_L);
-	cusparseDestroyCsric02Info(icinfo_A);
-	cusparseDestroyCsrsv2Info(info_L);
-	cusparseDestroyCsrsv2Info(info_LT);
-
-	// Free handles
-	cublasDestroy(cubHandle);
-	cusparseDestroy(cusHandle);
-
-	return 0;
+    cublasDestroy(cubH);
+    cusparseDestroy(cusH);
+    return 0;
 }