initial upload

2024-09-11 13:39:28 +08:00
parent c7e8487a02
commit 834df92696
68 changed files with 21889 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
 # ---> C++
 # Prerequisites
 *.d
@@ -32,3 +31,12 @@
 *.out
 *.app
 # folder preferences and build folder
 .DS_Store
 build/
 pack/
 .vscode/
 out/
 *.sh
 case_*
 config.h
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,30 @@
 cmake_minimum_required(VERSION 3.15.2)
 # 设置工程名称
 project(LibLCG VERSION 3.1 LANGUAGES CXX)
 # 添加配置文件编写的函数
 include(CMakePackageConfigHelpers)
 message(STATUS "Platform: " ${CMAKE_HOST_SYSTEM_NAME})
 # CMake默认的安装路径 Windows下为C:/Program\ Files/${Project_Name} Linux/Unix下为/usr/local
 message(STATUS "Install prefix: " ${CMAKE_INSTALL_PREFIX})
 # CMake默认的编译类型为空
 message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 # 添加编译选项
 option(LibLCG_OPENMP "Use OpenMP" ON) # Set OFF to disable the functionality 
 option(LibLCG_EIGEN "Use Eigen" ON)
 option(LibLCG_STD_COMPLEX "Use STD complex" ON)
 option(LibLCG_CUDA "Use CUDA" ON)
 message(STATUS "Use OpenMP: " ${LibLCG_OPENMP})
 message(STATUS "Use Eigen: " ${LibLCG_EIGEN})
 message(STATUS "Use STD complex: " ${LibLCG_STD_COMPLEX})
 message(STATUS "Use CUDA: " ${LibLCG_CUDA})
 # 加入一个头文件配置，让cmake对源码进行操作
 configure_file(
 	"${PROJECT_SOURCE_DIR}/config.h.in"
 	"${PROJECT_SOURCE_DIR}/src/lib/config.h"
 	)
 # 添加源文件地址
 add_subdirectory(src/)
--- a/2537
+++ b/2537
--- a/524
+++ b/524
@@ -0,0 +1,524 @@
 LibLCG License
 --------------
 LibLCG is distributed under a dual licensing scheme. You can
 redistribute it and/or modify it under the terms of the GNU Lesser
 General Public License (LGPL) as published by the Free Software 
 Foundation, either version 2 of the License, or (at your option) any 
 later version. A copy of the GNU Lesser General Public License is 
 reproduced below.
 If the terms and conditions of the LGPL v.2. would prevent you from
 using the LibLCG, please consider the option to obtain a commercial
 license for a fee. These licenses are offered by the LibLCG developing 
 team. As a rule, licenses are provided "as-is", unlimited in time for 
 a one time fee. Please send corresponding requests to:
 yizhang-geo@zju.edu.cn. Please do not forget to include some
 description of your company and the realm of its activities. Also add 
 information on how to contact you by electronic and paper mail.
 =====================================================================
                  GNU LESSER GENERAL PUBLIC LICENSE
                       Version 2.1, February 1999
 Copyright (C) 1991, 1999 Free Software Foundation, Inc.
 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
 [This is the first released version of the Lesser GPL.  It also counts
 as the successor of the GNU Library Public License, version 2, hence
 the version number 2.1.]
                            Preamble
  The licenses for most software are designed to take away your
 freedom to share and change it.  By contrast, the GNU General Public
 Licenses are intended to guarantee your freedom to share and change
 free software--to make sure the software is free for all its users.
  This license, the Lesser General Public License, applies to some
 specially designated software packages--typically libraries--of the
 Free Software Foundation and other authors who decide to use it.  You
 can use it too, but we suggest you first think carefully about whether
 this license or the ordinary General Public License is the better
 strategy to use in any particular case, based on the explanations below.
  When we speak of free software, we are referring to freedom of use,
 not price.  Our General Public Licenses are designed to make sure that
 you have the freedom to distribute copies of free software (and charge
 for this service if you wish); that you receive source code or can get
 it if you want it; that you can change the software and use pieces of
 it in new free programs; and that you are informed that you can do
 these things.
  To protect your rights, we need to make restrictions that forbid
 distributors to deny you these rights or to ask you to surrender these
 rights.  These restrictions translate to certain responsibilities for
 you if you distribute copies of the library or if you modify it.
  For example, if you distribute copies of the library, whether gratis
 or for a fee, you must give the recipients all the rights that we gave
 you.  You must make sure that they, too, receive or can get the source
 code.  If you link other code with the library, you must provide
 complete object files to the recipients, so that they can relink them
 with the library after making changes to the library and recompiling
 it.  And you must show them these terms so they know their rights.
  We protect your rights with a two-step method: (1) we copyright the
 library, and (2) we offer you this license, which gives you legal
 permission to copy, distribute and/or modify the library.
  To protect each distributor, we want to make it very clear that
 there is no warranty for the free library.  Also, if the library is
 modified by someone else and passed on, the recipients should know
 that what they have is not the original version, so that the original
 author's reputation will not be affected by problems that might be
 introduced by others.
  Finally, software patents pose a constant threat to the existence of
 any free program.  We wish to make sure that a company cannot
 effectively restrict the users of a free program by obtaining a
 restrictive license from a patent holder.  Therefore, we insist that
 any patent license obtained for a version of the library must be
 consistent with the full freedom of use specified in this license.
  Most GNU software, including some libraries, is covered by the
 ordinary GNU General Public License.  This license, the GNU Lesser
 General Public License, applies to certain designated libraries, and
 is quite different from the ordinary General Public License.  We use
 this license for certain libraries in order to permit linking those
 libraries into non-free programs.
  When a program is linked with a library, whether statically or using
 a shared library, the combination of the two is legally speaking a
 combined work, a derivative of the original library.  The ordinary
 General Public License therefore permits such linking only if the
 entire combination fits its criteria of freedom.  The Lesser General
 Public License permits more lax criteria for linking other code with
 the library.
  We call this license the "Lesser" General Public License because it
 does Less to protect the user's freedom than the ordinary General
 Public License.  It also provides other free software developers Less
 of an advantage over competing non-free programs.  These disadvantages
 are the reason we use the ordinary General Public License for many
 libraries.  However, the Lesser license provides advantages in certain
 special circumstances.
  For example, on rare occasions, there may be a special need to
 encourage the widest possible use of a certain library, so that it becomes
 a de-facto standard.  To achieve this, non-free programs must be
 allowed to use the library.  A more frequent case is that a free
 library does the same job as widely used non-free libraries.  In this
 case, there is little to gain by limiting the free library to free
 software only, so we use the Lesser General Public License.
  In other cases, permission to use a particular library in non-free
 programs enables a greater number of people to use a large body of
 free software.  For example, permission to use the GNU C Library in
 non-free programs enables many more people to use the whole GNU
 operating system, as well as its variant, the GNU/Linux operating
 system.
  Although the Lesser General Public License is Less protective of the
 users' freedom, it does ensure that the user of a program that is
 linked with the Library has the freedom and the wherewithal to run
 that program using a modified version of the Library.
  The precise terms and conditions for copying, distribution and
 modification follow.  Pay close attention to the difference between a
 "work based on the library" and a "work that uses the library".  The
 former contains code derived from the library, whereas the latter must
 be combined with the library in order to run.
                  GNU LESSER GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
  0. This License Agreement applies to any software library or other
 program which contains a notice placed by the copyright holder or
 other authorized party saying it may be distributed under the terms of
 this Lesser General Public License (also called "this License").
 Each licensee is addressed as "you".
  A "library" means a collection of software functions and/or data
 prepared so as to be conveniently linked with application programs
 (which use some of those functions and data) to form executables.
  The "Library", below, refers to any such software library or work
 which has been distributed under these terms.  A "work based on the
 Library" means either the Library or any derivative work under
 copyright law: that is to say, a work containing the Library or a
 portion of it, either verbatim or with modifications and/or translated
 straightforwardly into another language.  (Hereinafter, translation is
 included without limitation in the term "modification".)
  "Source code" for a work means the preferred form of the work for
 making modifications to it.  For a library, complete source code means
 all the source code for all modules it contains, plus any associated
 interface definition files, plus the scripts used to control compilation
 and installation of the library.
  Activities other than copying, distribution and modification are not
 covered by this License; they are outside its scope.  The act of
 running a program using the Library is not restricted, and output from
 such a program is covered only if its contents constitute a work based
 on the Library (independent of the use of the Library in a tool for
 writing it).  Whether that is true depends on what the Library does
 and what the program that uses the Library does.
  1. You may copy and distribute verbatim copies of the Library's
 complete source code as you receive it, in any medium, provided that
 you conspicuously and appropriately publish on each copy an
 appropriate copyright notice and disclaimer of warranty; keep intact
 all the notices that refer to this License and to the absence of any
 warranty; and distribute a copy of this License along with the
 Library.
  You may charge a fee for the physical act of transferring a copy,
 and you may at your option offer warranty protection in exchange for a
 fee.
  2. You may modify your copy or copies of the Library or any portion
 of it, thus forming a work based on the Library, and copy and
 distribute such modifications or work under the terms of Section 1
 above, provided that you also meet all of these conditions:
    a) The modified work must itself be a software library.
    b) You must cause the files modified to carry prominent notices
    stating that you changed the files and the date of any change.
    c) You must cause the whole of the work to be licensed at no
    charge to all third parties under the terms of this License.
    d) If a facility in the modified Library refers to a function or a
    table of data to be supplied by an application program that uses
    the facility, other than as an argument passed when the facility
    is invoked, then you must make a good faith effort to ensure that,
    in the event an application does not supply such function or
    table, the facility still operates, and performs whatever part of
    its purpose remains meaningful.
    (For example, a function in a library to compute square roots has
    a purpose that is entirely well-defined independent of the
    application.  Therefore, Subsection 2d requires that any
    application-supplied function or table used by this function must
    be optional: if the application does not supply it, the square
    root function must still compute square roots.)
 These requirements apply to the modified work as a whole.  If
 identifiable sections of that work are not derived from the Library,
 and can be reasonably considered independent and separate works in
 themselves, then this License, and its terms, do not apply to those
 sections when you distribute them as separate works.  But when you
 distribute the same sections as part of a whole which is a work based
 on the Library, the distribution of the whole must be on the terms of
 this License, whose permissions for other licensees extend to the
 entire whole, and thus to each and every part regardless of who wrote
 it.
 Thus, it is not the intent of this section to claim rights or contest
 your rights to work written entirely by you; rather, the intent is to
 exercise the right to control the distribution of derivative or
 collective works based on the Library.
 In addition, mere aggregation of another work not based on the Library
 with the Library (or with a work based on the Library) on a volume of
 a storage or distribution medium does not bring the other work under
 the scope of this License.
  3. You may opt to apply the terms of the ordinary GNU General Public
 License instead of this License to a given copy of the Library.  To do
 this, you must alter all the notices that refer to this License, so
 that they refer to the ordinary GNU General Public License, version 2,
 instead of to this License.  (If a newer version than version 2 of the
 ordinary GNU General Public License has appeared, then you can specify
 that version instead if you wish.)  Do not make any other change in
 these notices.
  Once this change is made in a given copy, it is irreversible for
 that copy, so the ordinary GNU General Public License applies to all
 subsequent copies and derivative works made from that copy.
  This option is useful when you wish to copy part of the code of
 the Library into a program that is not a library.
  4. You may copy and distribute the Library (or a portion or
 derivative of it, under Section 2) in object code or executable form
 under the terms of Sections 1 and 2 above provided that you accompany
 it with the complete corresponding machine-readable source code, which
 must be distributed under the terms of Sections 1 and 2 above on a
 medium customarily used for software interchange.
  If distribution of object code is made by offering access to copy
 from a designated place, then offering equivalent access to copy the
 source code from the same place satisfies the requirement to
 distribute the source code, even though third parties are not
 compelled to copy the source along with the object code.
  5. A program that contains no derivative of any portion of the
 Library, but is designed to work with the Library by being compiled or
 linked with it, is called a "work that uses the Library".  Such a
 work, in isolation, is not a derivative work of the Library, and
 therefore falls outside the scope of this License.
  However, linking a "work that uses the Library" with the Library
 creates an executable that is a derivative of the Library (because it
 contains portions of the Library), rather than a "work that uses the
 library".  The executable is therefore covered by this License.
 Section 6 states terms for distribution of such executables.
  When a "work that uses the Library" uses material from a header file
 that is part of the Library, the object code for the work may be a
 derivative work of the Library even though the source code is not.
 Whether this is true is especially significant if the work can be
 linked without the Library, or if the work is itself a library.  The
 threshold for this to be true is not precisely defined by law.
  If such an object file uses only numerical parameters, data
 structure layouts and accessors, and small macros and small inline
 functions (ten lines or less in length), then the use of the object
 file is unrestricted, regardless of whether it is legally a derivative
 work.  (Executables containing this object code plus portions of the
 Library will still fall under Section 6.)
  Otherwise, if the work is a derivative of the Library, you may
 distribute the object code for the work under the terms of Section 6.
 Any executables containing that work also fall under Section 6,
 whether or not they are linked directly with the Library itself.
  6. As an exception to the Sections above, you may also combine or
 link a "work that uses the Library" with the Library to produce a
 work containing portions of the Library, and distribute that work
 under terms of your choice, provided that the terms permit
 modification of the work for the customer's own use and reverse
 engineering for debugging such modifications.
  You must give prominent notice with each copy of the work that the
 Library is used in it and that the Library and its use are covered by
 this License.  You must supply a copy of this License.  If the work
 during execution displays copyright notices, you must include the
 copyright notice for the Library among them, as well as a reference
 directing the user to the copy of this License.  Also, you must do one
 of these things:
    a) Accompany the work with the complete corresponding
    machine-readable source code for the Library including whatever
    changes were used in the work (which must be distributed under
    Sections 1 and 2 above); and, if the work is an executable linked
    with the Library, with the complete machine-readable "work that
    uses the Library", as object code and/or source code, so that the
    user can modify the Library and then relink to produce a modified
    executable containing the modified Library.  (It is understood
    that the user who changes the contents of definitions files in the
    Library will not necessarily be able to recompile the application
    to use the modified definitions.)
    b) Use a suitable shared library mechanism for linking with the
    Library.  A suitable mechanism is one that (1) uses at run time a
    copy of the library already present on the user's computer system,
    rather than copying library functions into the executable, and (2)
    will operate properly with a modified version of the library, if
    the user installs one, as long as the modified version is
    interface-compatible with the version that the work was made with.
    c) Accompany the work with a written offer, valid for at
    least three years, to give the same user the materials
    specified in Subsection 6a, above, for a charge no more
    than the cost of performing this distribution.
    d) If distribution of the work is made by offering access to copy
    from a designated place, offer equivalent access to copy the above
    specified materials from the same place.
    e) Verify that the user has already received a copy of these
    materials or that you have already sent this user a copy.
  For an executable, the required form of the "work that uses the
 Library" must include any data and utility programs needed for
 reproducing the executable from it.  However, as a special exception,
 the materials to be distributed need not include anything that is
 normally distributed (in either source or binary form) with the major
 components (compiler, kernel, and so on) of the operating system on
 which the executable runs, unless that component itself accompanies
 the executable.
  It may happen that this requirement contradicts the license
 restrictions of other proprietary libraries that do not normally
 accompany the operating system.  Such a contradiction means you cannot
 use both them and the Library together in an executable that you
 distribute.
  7. You may place library facilities that are a work based on the
 Library side-by-side in a single library together with other library
 facilities not covered by this License, and distribute such a combined
 library, provided that the separate distribution of the work based on
 the Library and of the other library facilities is otherwise
 permitted, and provided that you do these two things:
    a) Accompany the combined library with a copy of the same work
    based on the Library, uncombined with any other library
    facilities.  This must be distributed under the terms of the
    Sections above.
    b) Give prominent notice with the combined library of the fact
    that part of it is a work based on the Library, and explaining
    where to find the accompanying uncombined form of the same work.
  8. You may not copy, modify, sublicense, link with, or distribute
 the Library except as expressly provided under this License.  Any
 attempt otherwise to copy, modify, sublicense, link with, or
 distribute the Library is void, and will automatically terminate your
 rights under this License.  However, parties who have received copies,
 or rights, from you under this License will not have their licenses
 terminated so long as such parties remain in full compliance.
  9. You are not required to accept this License, since you have not
 signed it.  However, nothing else grants you permission to modify or
 distribute the Library or its derivative works.  These actions are
 prohibited by law if you do not accept this License.  Therefore, by
 modifying or distributing the Library (or any work based on the
 Library), you indicate your acceptance of this License to do so, and
 all its terms and conditions for copying, distributing or modifying
 the Library or works based on it.
  10. Each time you redistribute the Library (or any work based on the
 Library), the recipient automatically receives a license from the
 original licensor to copy, distribute, link with or modify the Library
 subject to these terms and conditions.  You may not impose any further
 restrictions on the recipients' exercise of the rights granted herein.
 You are not responsible for enforcing compliance by third parties with
 this License.
  11. If, as a consequence of a court judgment or allegation of patent
 infringement or for any other reason (not limited to patent issues),
 conditions are imposed on you (whether by court order, agreement or
 otherwise) that contradict the conditions of this License, they do not
 excuse you from the conditions of this License.  If you cannot
 distribute so as to satisfy simultaneously your obligations under this
 License and any other pertinent obligations, then as a consequence you
 may not distribute the Library at all.  For example, if a patent
 license would not permit royalty-free redistribution of the Library by
 all those who receive copies directly or indirectly through you, then
 the only way you could satisfy both it and this License would be to
 refrain entirely from distribution of the Library.
 If any portion of this section is held invalid or unenforceable under any
 particular circumstance, the balance of the section is intended to apply,
 and the section as a whole is intended to apply in other circumstances.
 It is not the purpose of this section to induce you to infringe any
 patents or other property right claims or to contest validity of any
 such claims; this section has the sole purpose of protecting the
 integrity of the free software distribution system which is
 implemented by public license practices.  Many people have made
 generous contributions to the wide range of software distributed
 through that system in reliance on consistent application of that
 system; it is up to the author/donor to decide if he or she is willing
 to distribute software through any other system and a licensee cannot
 impose that choice.
 This section is intended to make thoroughly clear what is believed to
 be a consequence of the rest of this License.
  12. If the distribution and/or use of the Library is restricted in
 certain countries either by patents or by copyrighted interfaces, the
 original copyright holder who places the Library under this License may add
 an explicit geographical distribution limitation excluding those countries,
 so that distribution is permitted only in or among countries not thus
 excluded.  In such case, this License incorporates the limitation as if
 written in the body of this License.
  13. The Free Software Foundation may publish revised and/or new
 versions of the Lesser General Public License from time to time.
 Such new versions will be similar in spirit to the present version,
 but may differ in detail to address new problems or concerns.
 Each version is given a distinguishing version number.  If the Library
 specifies a version number of this License which applies to it and
 "any later version", you have the option of following the terms and
 conditions either of that version or of any later version published by
 the Free Software Foundation.  If the Library does not specify a
 license version number, you may choose any version ever published by
 the Free Software Foundation.
  14. If you wish to incorporate parts of the Library into other free
 programs whose distribution conditions are incompatible with these,
 write to the author to ask for permission.  For software which is
 copyrighted by the Free Software Foundation, write to the Free
 Software Foundation; we sometimes make exceptions for this.  Our
 decision will be guided by the two goals of preserving the free status
 of all derivatives of our free software and of promoting the sharing
 and reuse of software generally.
                            NO WARRANTY
  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
 WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
 EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
 OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
 KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
 LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
 THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
 WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
 AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
 FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
 CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
 LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
 RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
 FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
 SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGES.
                     END OF TERMS AND CONDITIONS
           How to Apply These Terms to Your New Libraries
  If you develop a new library, and you want it to be of the greatest
 possible use to the public, we recommend making it free software that
 everyone can redistribute and change.  You can do so by permitting
 redistribution under these terms (or, alternatively, under the terms of the
 ordinary General Public License).
  To apply these terms, attach the following notices to the library.  It is
 safest to attach them to the start of each source file to most effectively
 convey the exclusion of warranty; and each file should have at least the
 "copyright" line and a pointer to where the full notice is found.
    <one line to give the library's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.
    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.
    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
    USA
 Also add information on how to contact you by electronic and paper mail.
 You should also get your employer (if you work as a programmer) or your
 school, if any, to sign a "copyright disclaimer" for the library, if
 necessary.  Here is a sample; alter the names:
  Yoyodyne, Inc., hereby disclaims all copyright interest in the
  library `Frob' (a library for tweaking knobs) written by James Random
  Hacker.
  <signature of Ty Coon>, 1 April 1990
  Ty Coon, President of Vice
 That's all there is to it!
--- a/LibLCGConfig.cmake.in
+++ b/LibLCGConfig.cmake.in
@@ -0,0 +1,20 @@
@PACKAGE_INIT@
 set(@PROJECT_NAME@_Version "@PROJECT_VERSION@")
 set_and_check(@PROJECT_NAME@_INSTALL_PREFIX "${PACKAGE_PREFIX_DIR}")
 set_and_check(@PROJECT_NAME@_INC_DIR "${PACKAGE_PREFIX_DIR}/include")
 set_and_check(@PROJECT_NAME@_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/include")
 set_and_check(@PROJECT_NAME@_LIB_DIR "${PACKAGE_PREFIX_DIR}/lib")
 set_and_check(@PROJECT_NAME@_LIBRARY_DIR "${PACKAGE_PREFIX_DIR}/lib")
 set(@PROJECT_NAME@_LIB lcg)
 set(@PROJECT_NAME@_LIBRARY lcg)
 set(@PROJECT_NAME@_FOUND 1)
 set(@PROJECT_NAME@_OPENMP @LibLCG_OPENMP@)
 set(@PROJECT_NAME@_EIGEN @LibLCG_EIGEN@)
 set(@PROJECT_NAME@_STD_COMPLEX @LibLCG_STD_COMPLEX@)
 set(@PROJECT_NAME@_CUDA @LibLCG_CUDA@)
 # include target information
 include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
--- a/README.md
+++ b/README.md
@@ -1,2 +1,225 @@
-# liblcg
+# C++ Library of the Linear Conjugate Gradient Methods (LibLCG) 说明文档
 张壹（yizhang-geo@zju.edu.cn）
 _浙江大学地球科学学院·地球物理研究所_
 **此说明仅覆盖算法库的简单介绍及使用，更详细的内容请查看代码注释。如果还有问题，请发邮件联系我。同时也欢迎有兴趣的同学加入开发团队！**
 ## 简介
 liblcg 是一个高效的、可扩展的 C++ 线性共轭梯度算法库，在原生数据结构接口的基础上，同时提供基于Eigen3和CUDA的算法接口，可以方便地实现基于CPU或GPU并行的加速计算，其中基于Eigen3的算法包含了稠密与稀疏矩阵的实现，而基于CUDA的算法主要为稀疏矩阵的实现。liblcg 包含多种实数与复数域的共轭梯度算法与其他一些迭代求解方法。目前已有的方法包括共轭梯度法、预优的共轭梯度算法、共轭梯度平方算法、双稳共轭梯度算法、BB步共轭梯度投影法与SPG共轭梯度投影法；复数域的双共轭梯度法、共轭梯度平方法、预优的共轭梯度法与TFQMR法。共轭梯度法广泛应用于无约束与不等式约束的线性最优化问题，拥有优良的收敛与计算效率。
 共轭梯度算法可用于求解如下形式的线性方程组：
 ```
 Ax = B
 ```
 其中，A 是一个 N 阶的方阵、x 为 N\*1 大小的待求解的模型向量，B 为 N\*1 大小的需拟合的目标向量。需要注意的是，不同种类的共轭梯度算法对A可能有不同的要求，比如必须是正定的，或者对称的。不同算法的具体要求可以查阅其他参考文献或者查看代码中的注释。
 ## 安装
 算法库使用 CMake 工具进行构建，可在不同操作平台生成相应的Makefile或工程文件。
 ### 编译选项
 算法库目前可用的编译选项有：
 * LibLCG_OPENMP：是否使用OpenMP进行加速，需要安装OpenMP。默认为ON。
 * LibLCG_EIGEN：是否编译基于Eigen的算法与接口，需要安装Eigen。默认为ON。
 * LibLCG_STD_COMPLEX：是否使用std::complex\<double\>作为复数的默认类型。默认为ON。
 * LibLCG_CUDA：是否编译基于CUDA的算法与接口，需要安装CUDA。默认为ON。
 用户可以使用cmake命令中的-D选项对编译选项进行设置，比如关闭LibLCG_EIGEN：
 ```shell
 cmake -DLibLCG_EIGEN=OFF
 ```
 ### Linux 与 MacOS
 liblcg的默认安装路径为 /usr/local。头文件与动态库分别安装于 include 与 lib 文件夹。具体的编译与安装步骤如下：
 1. 下载安装CMake软件；
 2. 下载安装GCC编译器（常见系统已内置）；
 3. 在源文件路径内使用如下命令进行编译与安装：
 ```shell
 mkdir build && cd build && cmake .. && make install
 ```
 ### Windows
 #### MinGW 和 GCC
 Windows系统不包含GNU编译环境，用户需自行下载并配置。方法如下：
 1. 下载MinGW安装文件，并选择gcc、pthreads与make相关软件包安装；
 2. 下载安装CMake软件；
 3. 添加CMake与MinGW可执行文件路径至Windows环境变量；
 4. 在源文件路径内使用如下命令进行编译与安装：
 ```shell
 mkdir build && cd build && cmake .. -G "MinGW Makefiles" && make install
 ```
 默认的安装路径为C:/Program\\ Files。头文件与动态库分别安装于 include 与 lib 文件夹。
 **注意：用户需要手动添加头文件与动态库地址到计算机的环境变量中。**
 #### Visual Studio
 用户可使用CMake工具构建VS工程文件并编译使用动态库。方法如下：
 1. 下载安装 Visual Studio 软件；
 2. 下载安装CMake软件；
 3. 在源文件路径内使用如下命令生成VS工程文件：
 ```shell
 mkdir build && cd build && cmake .. -G "Visual Studio 16 2019"
 ```
 _注：如需生成其他版本的VS工程文件，请使用-G命令查看相应的识别码。_
 4. 使用 Visual Studio 打开.sln工程文件并编译动态库。
 ## 使用与编译
 用户使用库函数时需在源文件中引入相应的头文件，如：
 ```cpp
 #include "lcg/lcg.h"
 ```
 编译可执行文件时需链接lcg动态库。以g++为例：
 ```shell
 g++ example.cpp -llcg -o example_out
 ```
 ## 快速开始
 要使用liblcg求解线性方程组Ax=B，用户需要定义Ax乘积的计算函数（回调函数），该函数的功能为计算不同的x所对应的乘积Ax。以实数类型的共轭梯度算法为例，其回调函数的接口定义为：
 ```cpp
 typedef void (*lcg_axfunc_ptr)(void* instance, const lcg_float* x, lcg_float* prod_Ax, const int n_size);
 ```
 其中，`x`为输入的向量，`prod_Ax`为返回的乘积向量，`n_size`为这两个向量的长度。注意此处参数列表中并不包含矩阵A，这意味着A必须为全局或者类变量。这样设计的主要原因是在某些复杂最优化问题的编程中，计算并存储A并不实际或者划算，此时一般采用的策略是存储相关变量且仅计算Ax的乘积，所以矩阵A并不总是存在。
 用户在定义Ax计算函数后即可调用求解函数 lcg_solver() 对线性方程组进行求解。以无约束的求解函数为例，其声明如下：
 ```cpp
 int lcg_solver(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
 	const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_CGS);
 ```
 其中：
 1. `lcg_axfunc_ptr Afp` 为正演计算的回调函数；
 2. `lcg_progress_ptr Pfp` 监控迭代过程的回调函数（非必须，无需监控时使用 nullptr 参数即可）；
 3. `lcg_float* m` 初始解向量，迭代取得的解也保存于此数组；
 4. `const lcg_float* B` Ax = B 中的 B 项；
 5. `const int n_size` 解向量的大小；
 6. `const lcg_para* param` 迭代使用的参数，此参数为 nullptr 即使用默认参数；
 7. `void* instance` 传入的实例对象, 此函数在类中使用即为类的 this 指针, 在普通函数中使用时即为 nullptr；
 8. `lcg_solver_enum solver_id` 求解函数使用的求解方法，具体的方法代号可查看对应的头文件。
 ### 一个简单的例子
 ```cpp
 #include "cmath"
 #include "iostream"
 #include "lcg/lcg.h"
 #define M 100
 #define N 80
 // 返回两个数组元素之间的最大差值
 lcg_float max_diff(const lcg_float *a, const lcg_float *b, int size)
 {
 	lcg_float max = -1;
 	for (int i = 0; i < size; i++)
 	{
 		max = lcg_max(sqrt((a[i] - b[i])*(a[i] - b[i])), max);
 	}
 	return max;
 }
 // 普通二维数组做核矩阵
 lcg_float **kernel;
 // 中间结果数组
 lcg_float *tmp_arr;
 // 计算核矩阵乘向量的乘积 lcg_solver的回调函数
 void CalAx(void* instance, const lcg_float* x, lcg_float* prod_Ax, const int n_s)
 {
    // 注意核矩阵实际为 kernel^T * kernel，大小为N*N
 	lcg_matvec(kernel, x, tmp_arr, M, n_s, MatNormal); // tmp_arr = kernel * x
 	lcg_matvec(kernel, tmp_arr, prod_Ax, M, n_s, MatTranspose); // prod_Ax = kernel^T * tmp_arr
 	return;
 }
 // 定义监控函数 lcg_solver的回调函数
 // 这个函数显示当前的迭代次数与收敛值
 int Prog(void* instance, const lcg_float* m, const lcg_float converge, const lcg_para* param, const int n_s, const int k)
 {
 	std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	return 0;
 }
 int main(int argc, char const *argv[])
 {
    // 开辟数组空间
 	kernel = lcg_malloc(M, N);
 	tmp_arr = lcg_malloc(M);
    // 为核矩阵赋初值
 	lcg_vecrnd(kernel, -1.0, 1.0, M, N);
 	// 生成一组理论解
 	lcg_float *fm = lcg_malloc(N);
 	lcg_vecrnd(fm, 1.0, 2.0, N);
 	// 计算共轭梯度B项
 	lcg_float *B = lcg_malloc(N);
 	lcg_matvec(kernel, fm, tmp_arr, M, N, MatNormal);
 	lcg_matvec(kernel, tmp_arr, B, M, N, MatTranspose);
 	// 设置共轭梯度参数
 	lcg_para self_para = lcg_default_parameters();
 	self_para.epsilon = 1e-5;
 	self_para.abs_diff = 0;
 	// 声明一组解
 	lcg_float *m = lcg_malloc(N);
 	lcg_vecset(m, 0.0, N);
 	// 使用标准共轭梯度方法（LCG_CG）求解线性方程组
    // 将回调函数传递给solver
    // 由于回调函数为全局函数，因此instance变量的值为NULL
 	int ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_CG);
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
    // 销毁数组
 	lcg_free(kernel, M);
 	lcg_free(tmp_arr);
 	lcg_free(fm);
 	lcg_free(B);
 	lcg_free(m);
 	return 0;
 }
 ```
 **完整的例子储存在[sample](src/sample)文件夹内。**
 ## 类模版
 liblcg为不同类型的共轭梯度算法定义了通用的求解类模版，包含了类中函数的指针代理及通用的监控函数实现，用户可直接继承并使用。需要注意的是这些类模版中定义了纯虚的函数接口，用户需要全部实现。其中没用到的定义成空函数就行了。以实数的求解类模版为例，需要实现的接口函数包括：
 ```cpp
 void AxProduct(const lcg_float* a, lcg_float* b, const int num) = 0
 void MxProduct(const lcg_float* a, lcg_float* b, const int num) = 0
 ```
 其中`AxProduct`是Ax的计算函数，`MxProduct`是预优过程的计算函数，即M^-1x。
--- a/config.h.in
+++ b/config.h.in
@@ -0,0 +1,4 @@
 /* Feature switches; each line is rewritten when CMake configures this template. */
 /* Defined when the LibLCG_OPENMP build option ("Use OpenMP") is enabled. */
 #cmakedefine LibLCG_OPENMP
 /* Defined when the LibLCG_EIGEN build option ("Use Eigen") is enabled. */
 #cmakedefine LibLCG_EIGEN
 /* Defined when the LibLCG_STD_COMPLEX build option ("Use STD complex") is enabled. */
 #cmakedefine LibLCG_STD_COMPLEX
 /* Defined when the LibLCG_CUDA build option ("Use CUDA") is enabled. */
 #cmakedefine LibLCG_CUDA
--- a/data/README
+++ b/data/README
@@ -0,0 +1,11 @@
 case_*_A: Full symmetric matrix
 [ N (int) | nz (int) ]
 [ RowIdx (int) | ColIdx (int) | Val (double) ] * nz
 [ b (double) * N ]
 [ d (double) * N ] (complex matrix only)
 case_*_B: Vector
 [ N (int) ]
 [ x (double) * N]
--- a/data/cases.7z
+++ b/data/cases.7z
--- a/data/get_cdat.cpp
+++ b/data/get_cdat.cpp
@@ -0,0 +1,105 @@
 #include "../src/lib/lcg_complex.h"
 #include "iostream"
 #include "fstream"
 #include "vector"
 #include "Eigen/Sparse"
 // rand() modulo x. The argument is pasted without parentheses, so pass a
 // plain identifier or literal rather than an expression.
 #define random(x) (rand()%x)
 typedef Eigen::SparseMatrix<lcg_complex, Eigen::RowMajor> spmat_cd; // Row-major is requested explicitly; note that Eigen's default sparse storage order is column-major
 // (row, column, value) triplet used to assemble the sparse matrix.
 typedef Eigen::Triplet<lcg_complex> triplt_cd;
 /**
  * Build a random sparse, symmetric complex test system A*x = b and write it
  * to two binary files in the current working directory:
  *
  *   case_1M_cA : N (int), nz (int), then nz records of
  *                { row (int), col (int), value (lcg_complex) },
  *                followed by the N entries of b (lcg_complex)
  *   case_1M_cB : N (int), followed by the N entries of the reference
  *                solution x (lcg_complex)
  *
  * The command-line arguments are not used.
  *
  * @return 0 on success, 1 if an output file cannot be opened or written.
  */
 int main(int argc, char const *argv[])
 {
     int N = 1000000;  // order of the square matrix
     int nz = 1013000; // random values drawn: N diagonal entries plus (nz - N) mirrored off-diagonal pairs

     lcg_complex *v = new lcg_complex[nz]; // matrix values
     lcg_complex *x = new lcg_complex[N];  // reference solution
     lcg_complex *b = new lcg_complex[N];  // right-hand side, accumulated as A*x below

     lcg_complex one(1.0, 1.0), zero(0.0, 0.0);

     // clcg_vecrnd / clcg_vecset come from lcg_complex.h; presumably they fill
     // the array with random values between the two bounds / with a constant.
     clcg_vecrnd(v, 1.0*one, 10.0*one, nz);
     clcg_vecrnd(x, 1.0*one, 2.0*one, N);
     clcg_vecset(b, zero, N);

     std::vector<triplt_cd> val_triplt;
     val_triplt.reserve(2*(nz-N) + N);

     // Diagonal entries.
     for (int i = 0; i < N; i++)
     {
         val_triplt.push_back(triplt_cd(i, i, v[i]));
         b[i] += v[i]*x[i];
     }

     // Off-diagonal entries, inserted as mirrored pairs (r, c) and (c, r) so
     // that the matrix stays symmetric. A value is consumed only when r != c.
     srand((int)time(0));
     int r, c;
     int j = N;
     while (j < nz)
     {
         r = random(N);
         c = random(N);
         if (r != c)
         {
             val_triplt.push_back(triplt_cd(r, c, v[j]));
             val_triplt.push_back(triplt_cd(c, r, v[j]));
             b[r] += v[j]*x[c];
             b[c] += v[j]*x[r];
             j++;
         }
     }

     spmat_cd A;
     A.resize(N, N);
     A.setZero();
     // NOTE(review): the same (r, c) position may be drawn more than once; this
     // relies on setFromTriplets combining repeated entries by summation, which
     // is what the accumulation into b assumes - confirm against the Eigen docs.
     A.setFromTriplets(val_triplt.begin(), val_triplt.end());

     std::ofstream Aout, Bout;
     Aout.open("case_1M_cA", std::ios::binary);
     Bout.open("case_1M_cB", std::ios::binary);
     if (!Aout.is_open() || !Bout.is_open())
     {
         // Without this check every write below would fail silently.
         std::cerr << "get_cdat: cannot open case_1M_cA / case_1M_cB for writing" << std::endl;
         delete[] v;
         delete[] x;
         delete[] b;
         return 1;
     }

     lcg_complex tmp;
     nz = A.nonZeros(); // number of entries actually stored in A

     Aout.write((char*)&N, sizeof(int));
     Aout.write((char*)&nz, sizeof(int));
     for (int i = 0; i < N; i++)
     {
         // Visit the stored entries of row i (A is row-major).
         for (Eigen::SparseMatrix<lcg_complex, Eigen::RowMajor>::InnerIterator it(A, i); it; ++it)
         {
             r = it.row();
             c = it.col();
             tmp = it.value();
             Aout.write((char*)&r, sizeof(int));
             Aout.write((char*)&c, sizeof(int));
             Aout.write((char*)&tmp, sizeof(lcg_complex));
         }
     }

     // The right-hand side is appended to the matrix file.
     for (int i = 0; i < N; i++)
     {
         tmp = b[i];
         Aout.write((char*)&tmp, sizeof(lcg_complex));
     }
     bool write_ok = Aout.good();
     Aout.close();

     Bout.write((char*)&N, sizeof(int));
     for (int i = 0; i < N; i++)
     {
         tmp = x[i];
         Bout.write((char*)&tmp, sizeof(lcg_complex));
     }
     write_ok = write_ok && Bout.good();
     Bout.close();

     delete[] v;
     delete[] x;
     delete[] b;

     if (!write_ok)
     {
         std::cerr << "get_cdat: failed while writing case_1M_cA / case_1M_cB" << std::endl;
         return 1;
     }
     return 0;
 }
--- a/doxy/doxygen.sty
+++ b/doxy/doxygen.sty
@@ -0,0 +1,577 @@
 % stylesheet for doxygen 1.8.17
 \NeedsTeXFormat{LaTeX2e}
 \ProvidesPackage{doxygen}
 % Packages used by this style file
 \RequirePackage{alltt}
 %%\RequirePackage{array} %% moved to refman.tex due to workaround for LaTex 2019 version and unmaintained tabu package
 \RequirePackage{calc}
 \RequirePackage{float}
 %%\RequirePackage{ifthen} %% moved to refman.tex due to workaround for LaTex 2019 version and unmaintained tabu package
 \RequirePackage{verbatim}
 \RequirePackage[table]{xcolor}
 \RequirePackage{longtable_doxygen}
 \RequirePackage{tabu_doxygen}
 \RequirePackage{fancyvrb}
 \RequirePackage{tabularx}
 \RequirePackage{multirow}
 \RequirePackage{hanging}
 \RequirePackage{ifpdf}
 \RequirePackage{adjustbox}
 \RequirePackage{amssymb}
 \RequirePackage{stackengine}
 \RequirePackage[normalem]{ulem} % for strikeout, but don't modify emphasis
 %---------- Internal commands used in this style file ----------------
 \newcommand{\ensurespace}[1]{%
  \begingroup%
    \setlength{\dimen@}{#1}%
    \vskip\z@\@plus\dimen@%
    \penalty -100\vskip\z@\@plus -\dimen@%
    \vskip\dimen@%
    \penalty 9999%
    \vskip -\dimen@%
    \vskip\z@skip% hide the previous |\vskip| from |\addvspace|
  \endgroup%
 }
 \newcommand{\DoxyHorRuler}[1]{%
  \setlength{\parskip}{0ex plus 0ex minus 0ex}%
  \ifthenelse{#1=0}%
  {%
    \hrule%
  }%
  {%
    \hrulefilll%
  }%
 }
 \newcommand{\DoxyLabelFont}{}
 \newcommand{\entrylabel}[1]{%
  {%
    \parbox[b]{\labelwidth-4pt}{%
      \makebox[0pt][l]{\DoxyLabelFont#1}%
      \vspace{1.5\baselineskip}%
    }%
  }%
 }
 \newenvironment{DoxyDesc}[1]{%
  \ensurespace{4\baselineskip}%
  \begin{list}{}{%
    \settowidth{\labelwidth}{20pt}%
    %\setlength{\parsep}{0pt}%
    \setlength{\itemsep}{0pt}%
    \setlength{\leftmargin}{\labelwidth+\labelsep}%
    \renewcommand{\makelabel}{\entrylabel}%
  }%
  \item[#1]%
 }{%
  \end{list}%
 }
 \newsavebox{\xrefbox}
 \newlength{\xreflength}
 \newcommand{\xreflabel}[1]{%
  \sbox{\xrefbox}{#1}%
  \setlength{\xreflength}{\wd\xrefbox}%
  \ifthenelse{\xreflength>\labelwidth}{%
    \begin{minipage}{\textwidth}%
      \setlength{\parindent}{0pt}%
      \hangindent=15pt\bfseries #1\vspace{1.2\itemsep}%
    \end{minipage}%
  }{%
   \parbox[b]{\labelwidth}{\makebox[0pt][l]{\textbf{#1}}}%
  }%
 }
 %---------- Commands used by doxygen LaTeX output generator ----------
 % Used by <pre> ... </pre>
 \newenvironment{DoxyPre}{%
  \small%
  \begin{alltt}%
 }{%
  \end{alltt}%
  \normalsize%
 }
 % Necessary for redefining not defined characters, i.e. "Replacement Character" in tex output.
 \newlength{\CodeWidthChar}
 \newlength{\CodeHeightChar}
 \settowidth{\CodeWidthChar}{?}
 \settoheight{\CodeHeightChar}{?}
 % Necessary for hanging indent
 \newlength{\DoxyCodeWidth}
 \newcommand\DoxyCodeLine[1]{\hangpara{\DoxyCodeWidth}{1}{#1}\par}
 \newcommand\NiceSpace{%
     \discretionary{}{\kern\fontdimen2\font}{\kern\fontdimen2\font}%
 }
 % Used by @code ... @endcode
 \newenvironment{DoxyCode}[1]{%
  \par%
  \scriptsize%
  \normalfont\ttfamily%
  \rightskip0pt plus 1fil%
  \settowidth{\DoxyCodeWidth}{000000}%
  \settowidth{\CodeWidthChar}{?}%
  \settoheight{\CodeHeightChar}{?}%
  \setlength{\parskip}{0ex plus 0ex minus 0ex}%
  \ifthenelse{\equal{#1}{0}}
  {
    {\lccode`~32 \lowercase{\global\let~}\NiceSpace}\obeyspaces%
  }
  {
    {\lccode`~32 \lowercase{\global\let~}}\obeyspaces%
  }
 }{%
  \normalfont%
  \normalsize%
  \settowidth{\CodeWidthChar}{?}%
  \settoheight{\CodeHeightChar}{?}%
 }
 % Redefining not defined characters, i.e. "Replacement Character" in tex output.
 \def\ucr{\adjustbox{width=\CodeWidthChar,height=\CodeHeightChar}{\stackinset{c}{}{c}{-.2pt}{%
   \textcolor{white}{\sffamily\bfseries\small ?}}{%
   \rotatebox{45}{$\blacksquare$}}}}
 % Used by @example, @include, @includelineno and @dontinclude
 \newenvironment{DoxyCodeInclude}[1]{%
 	\DoxyCode{#1}%
 }{%
  \endDoxyCode%
 }
 % Used by @verbatim ... @endverbatim
 \newenvironment{DoxyVerb}{%
  \footnotesize%
  \verbatim%
 }{%
  \endverbatim%
  \normalsize%
 }
 % Used by @verbinclude
 \newenvironment{DoxyVerbInclude}{%
  \DoxyVerb%
 }{%
  \endDoxyVerb%
 }
 % Used by numbered lists (using '-#' or <ol> ... </ol>)
 \newenvironment{DoxyEnumerate}{%
  \enumerate%
 }{%
  \endenumerate%
 }
 % Used by bullet lists (using '-', @li, @arg, or <ul> ... </ul>)
 \newenvironment{DoxyItemize}{%
  \itemize%
 }{%
  \enditemize%
 }
 % Used by description lists (using <dl> ... </dl>)
 \newenvironment{DoxyDescription}{%
  \description%
 }{%
  \enddescription%
 }
 % Used by @image, @dotfile, @dot ... @enddot, and @msc ... @endmsc
 % (only if caption is specified)
 \newenvironment{DoxyImage}{%
  \begin{figure}[H]%
    \begin{center}%
 }{%
    \end{center}%
  \end{figure}%
 }
 % Used by @image, @dotfile, @dot ... @enddot, and @msc ... @endmsc
 % (only if no caption is specified)
 \newenvironment{DoxyImageNoCaption}{%
  \begin{center}%
 }{%
  \end{center}%
 }
 % Used by @image
 % (only if inline is specified)
 \newenvironment{DoxyInlineImage}{%
 }{%
 }
 % Used by @attention
 \newenvironment{DoxyAttention}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @author and @authors
 \newenvironment{DoxyAuthor}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @date
 \newenvironment{DoxyDate}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @invariant
 \newenvironment{DoxyInvariant}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @note
 \newenvironment{DoxyNote}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @post
 \newenvironment{DoxyPostcond}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @pre
 \newenvironment{DoxyPrecond}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @copyright
 \newenvironment{DoxyCopyright}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @remark
 \newenvironment{DoxyRemark}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @return and @returns
 \newenvironment{DoxyReturn}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @since
 \newenvironment{DoxySince}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @see
 \newenvironment{DoxySeeAlso}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @version
 \newenvironment{DoxyVersion}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @warning
 \newenvironment{DoxyWarning}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by @internal
 \newenvironment{DoxyInternal}[1]{%
  \paragraph*{#1}%
 }{%
 }
 % Used by @par and @paragraph
 \newenvironment{DoxyParagraph}[1]{%
  \begin{DoxyDesc}{#1}%
 }{%
  \end{DoxyDesc}%
 }
 % Used by parameter lists
 \newenvironment{DoxyParams}[2][]{%
    \tabulinesep=1mm%
    \par%
    \ifthenelse{\equal{#1}{}}%
      {\begin{longtabu*}spread 0pt [l]{|X[-1,l]|X[-1,l]|}}% name + description
    {\ifthenelse{\equal{#1}{1}}%
      {\begin{longtabu*}spread 0pt [l]{|X[-1,l]|X[-1,l]|X[-1,l]|}}% in/out + name + desc
      {\begin{longtabu*}spread 0pt [l]{|X[-1,l]|X[-1,l]|X[-1,l]|X[-1,l]|}}% in/out + type + name + desc
    }
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #2}\\[1ex]%
    \hline%
    \endfirsthead%
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #2}\\[1ex]%
    \hline%
    \endhead%
 }{%
    \end{longtabu*}%
    \vspace{6pt}%
 }
 % Used for fields of simple structs
 \newenvironment{DoxyFields}[1]{%
    \tabulinesep=1mm%
    \par%
    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|X[-1,l]|}%
    \multicolumn{3}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endfirsthead%
    \multicolumn{3}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endhead%
 }{%
    \end{longtabu*}%
    \vspace{6pt}%
 }
 % Used for fields simple class style enums
 \newenvironment{DoxyEnumFields}[1]{%
    \tabulinesep=1mm%
    \par%
    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|}%
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endfirsthead%
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endhead%
 }{%
    \end{longtabu*}%
    \vspace{6pt}%
 }
 % Used for parameters within a detailed function description
 \newenvironment{DoxyParamCaption}{%
  \renewcommand{\item}[2][]{\\ \hspace*{2.0cm} ##1 {\em ##2}}% 
 }{%
 }
 % Used by return value lists
 \newenvironment{DoxyRetVals}[1]{%
    \tabulinesep=1mm%
    \par%
    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|}%
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endfirsthead%
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endhead%
 }{%
    \end{longtabu*}%
    \vspace{6pt}%
 }
 % Used by exception lists
 \newenvironment{DoxyExceptions}[1]{%
    \tabulinesep=1mm%
    \par%
    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|}%
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endfirsthead%
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endhead%
 }{%
    \end{longtabu*}%
    \vspace{6pt}%
 }
 % Used by template parameter lists
 \newenvironment{DoxyTemplParams}[1]{%
    \tabulinesep=1mm%
    \par%
    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|}%
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endfirsthead%
    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
    \hline%
    \endhead%
 }{%
    \end{longtabu*}%
    \vspace{6pt}%
 }
 % Used for member lists
 \newenvironment{DoxyCompactItemize}{%
  \begin{itemize}%
    \setlength{\itemsep}{-3pt}%
    \setlength{\parsep}{0pt}%
    \setlength{\topsep}{0pt}%
    \setlength{\partopsep}{0pt}%
 }{%
  \end{itemize}%
 }
 % Used for member descriptions
 \newenvironment{DoxyCompactList}{%
  \begin{list}{}{%
    \setlength{\leftmargin}{0.5cm}%
    \setlength{\itemsep}{0pt}%
    \setlength{\parsep}{0pt}%
    \setlength{\topsep}{0pt}%
    \renewcommand{\makelabel}{\hfill}%
  }%
 }{%
  \end{list}%
 }
 % Used for reference lists (@bug, @deprecated, @todo, etc.)
 \newenvironment{DoxyRefList}{%
  \begin{list}{}{%
    \setlength{\labelwidth}{10pt}%
    \setlength{\leftmargin}{\labelwidth}%
    \addtolength{\leftmargin}{\labelsep}%
    \renewcommand{\makelabel}{\xreflabel}%
  }%
 }{%
  \end{list}%
 }
 % Used by @bug, @deprecated, @todo, etc.
 \newenvironment{DoxyRefDesc}[1]{%
  \begin{list}{}{%
    \renewcommand\makelabel[1]{\textbf{##1}}%
    \settowidth\labelwidth{\makelabel{#1}}%
    \setlength\leftmargin{\labelwidth+\labelsep}%
  }%
 }{%
  \end{list}%
 }
 % Used by parameter lists and simple sections
 \newenvironment{Desc}
 {\begin{list}{}{%
    \settowidth{\labelwidth}{20pt}%
    \setlength{\parsep}{0pt}%
    \setlength{\itemsep}{0pt}%
    \setlength{\leftmargin}{\labelwidth+\labelsep}%
    \renewcommand{\makelabel}{\entrylabel}%
  }
 }{%
  \end{list}%
 }
 % Used by tables
 \newcommand{\PBS}[1]{\let\temp=\\#1\let\\=\temp}%
 \newenvironment{TabularC}[1]%
 {\tabulinesep=1mm
 \begin{longtabu*}spread 0pt [c]{*#1{|X[-1]}|}}%
 {\end{longtabu*}\par}%
 \newenvironment{TabularNC}[1]%
 {\begin{tabu}spread 0pt [l]{*#1{|X[-1]}|}}%
 {\end{tabu}\par}%
 % Used for member group headers
 \newenvironment{Indent}{%
  \begin{list}{}{%
    \setlength{\leftmargin}{0.5cm}%
  }%
  \item[]\ignorespaces%
 }{%
  \unskip%
  \end{list}%
 }
 % Used when hyperlinks are turned off
 \newcommand{\doxyref}[3]{%
  \textbf{#1} (\textnormal{#2}\,\pageref{#3})%
 }
 % Used to link to a table when hyperlinks are turned on
 \newcommand{\doxytablelink}[2]{%
  \ref{#1}%
 }
 % Used to link to a table when hyperlinks are turned off
 \newcommand{\doxytableref}[3]{%
  \ref{#3}%
 }
 % Used by @addindex
 \newcommand{\lcurly}{\{}
 \newcommand{\rcurly}{\}}
 % Colors used for syntax highlighting
 \definecolor{comment}{rgb}{0.5,0.0,0.0}
 \definecolor{keyword}{rgb}{0.0,0.5,0.0}
 \definecolor{keywordtype}{rgb}{0.38,0.25,0.125}
 \definecolor{keywordflow}{rgb}{0.88,0.5,0.0}
 \definecolor{preprocessor}{rgb}{0.5,0.38,0.125}
 \definecolor{stringliteral}{rgb}{0.0,0.125,0.25}
 \definecolor{charliteral}{rgb}{0.0,0.5,0.5}
 \definecolor{vhdldigit}{rgb}{1.0,0.0,1.0}
 \definecolor{vhdlkeyword}{rgb}{0.43,0.0,0.43}
 \definecolor{vhdllogic}{rgb}{1.0,0.0,0.0}
 \definecolor{vhdlchar}{rgb}{0.0,0.0,0.0}
 % Color used for table heading
 \newcommand{\tableheadbgcolor}{lightgray}%
 % Version of hypertarget with correct landing location
 \newcommand{\Hypertarget}[1]{\Hy@raisedlink{\hypertarget{#1}{}}}
 % possibility to have sections etc. be within the margins
 % unfortunately had to copy part of book.cls and add \raggedright
 \makeatletter
 \newcommand\doxysection{\@startsection {section}{1}{\z@}%
                                   {-3.5ex \@plus -1ex \@minus -.2ex}%
                                   {2.3ex \@plus.2ex}%
                                   {\raggedright\normalfont\Large\bfseries}}
 \newcommand\doxysubsection{\@startsection{subsection}{2}{\z@}%
                                     {-3.25ex\@plus -1ex \@minus -.2ex}%
                                     {1.5ex \@plus .2ex}%
                                     {\raggedright\normalfont\large\bfseries}}
 \newcommand\doxysubsubsection{\@startsection{subsubsection}{3}{\z@}%
                                     {-3.25ex\@plus -1ex \@minus -.2ex}%
                                     {1.5ex \@plus .2ex}%
                                     {\raggedright\normalfont\normalsize\bfseries}}
 \newcommand\doxyparagraph{\@startsection{paragraph}{4}{\z@}%
                                    {3.25ex \@plus1ex \@minus.2ex}%
                                    {-1em}%
                                    {\raggedright\normalfont\normalsize\bfseries}}
 \newcommand\doxysubparagraph{\@startsection{subparagraph}{5}{\parindent}%
                                       {3.25ex \@plus1ex \@minus .2ex}%
                                       {-1em}%
                                      {\raggedright\normalfont\normalsize\bfseries}}
 \makeatother
 % Define caption that is also suitable in a table
 \makeatletter
 \def\doxyfigcaption{%
 \refstepcounter{figure}%
 \@dblarg{\@caption{figure}}}
 \makeatother
--- a/doxy/footer.tex
+++ b/doxy/footer.tex
@@ -0,0 +1,12 @@
 % Latex footer for doxygen 1.8.17
 %--- End generated contents ---
 % Index
 \backmatter
 \newpage
 \phantomsection
 \clearemptydoublepage
 \addcontentsline{toc}{chapter}{\indexname}
 \printindex
 \end{document}
--- a/doxy/header.tex
+++ b/doxy/header.tex
@@ -0,0 +1,174 @@
 % Latex header for doxygen 1.8.17
 \let\mypdfximage\pdfximage\def\pdfximage{\immediate\mypdfximage}\documentclass[twoside]{book}
 %% moved from doxygen.sty due to workaround for LaTex 2019 version and unmaintained tabu package
 \usepackage{ifthen}
 \ifx\requestedLaTeXdate\undefined
 \usepackage{array}
 \else
 \usepackage{array}[=2016-10-06]
 \fi
 %%
 % Packages required by doxygen
 \usepackage{fixltx2e}
 \usepackage{calc}
 \usepackage{doxygen}
 \usepackage{graphicx}
 \usepackage[utf8]{inputenc}
 \usepackage{makeidx}
 \usepackage{multicol}
 \usepackage{multirow}
 \PassOptionsToPackage{warn}{textcomp}
 \usepackage{textcomp}
 \usepackage[nointegrals]{wasysym}
 \usepackage[table]{xcolor}
 \usepackage{ifpdf,ifxetex}
 % Font selection
 \usepackage[T1]{fontenc}
 \usepackage[scaled=.90]{helvet}
 \usepackage{courier}
 \usepackage{amssymb}
 \usepackage{sectsty}
 \renewcommand{\familydefault}{\sfdefault}
 \allsectionsfont{%
  \fontseries{bc}\selectfont%
  \color{darkgray}%
 }
 \renewcommand{\DoxyLabelFont}{%
  \fontseries{bc}\selectfont%
  \color{darkgray}%
 }
 \newcommand{\+}{\discretionary{\mbox{\scriptsize$\hookleftarrow$}}{}{}}
 % Arguments of doxygenemoji:
 % 1) ':<text>:' form of the emoji, already "LaTeX"-escaped
 % 2) file with the name of the emoji without the .png extension
 % in case image exist use this otherwise use the ':<text>:' form
 \newcommand{\doxygenemoji}[2]{%
  \IfFileExists{./#2.png}{\raisebox{-0.1em}{\includegraphics[height=0.9em]{./#2.png}}}{#1}%
 }
 % Page & text layout
 \usepackage{geometry}
 \geometry{%
  a4paper,%
  top=2.5cm,%
  bottom=2.5cm,%
  left=2.5cm,%
  right=2.5cm%
 }
 \tolerance=750
 \hfuzz=15pt
 \hbadness=750
 \setlength{\emergencystretch}{15pt}
 \setlength{\parindent}{0cm}
 \newcommand{\doxynormalparskip}{\setlength{\parskip}{3ex plus 2ex minus 2ex}}
 \newcommand{\doxytocparskip}{\setlength{\parskip}{1ex plus 0ex minus 0ex}}
 \doxynormalparskip
 \makeatletter
 \renewcommand{\paragraph}{%
  \@startsection{paragraph}{4}{0ex}{-1.0ex}{1.0ex}{%
    \normalfont\normalsize\bfseries\SS@parafont%
  }%
 }
 \renewcommand{\subparagraph}{%
  \@startsection{subparagraph}{5}{0ex}{-1.0ex}{1.0ex}{%
    \normalfont\normalsize\bfseries\SS@subparafont%
  }%
 }
 \makeatother
 \makeatletter
 \newcommand\hrulefilll{\leavevmode\leaders\hrule\hskip 0pt plus 1filll\kern\z@}
 \makeatother
 % Headers & footers
 \usepackage{fancyhdr}
 \pagestyle{fancyplain}
 \fancyhead[LE]{\fancyplain{}{\bfseries\thepage}}
 \fancyhead[CE]{\fancyplain{}{}}
 \fancyhead[RE]{\fancyplain{}{\bfseries\leftmark}}
 \fancyhead[LO]{\fancyplain{}{\bfseries\rightmark}}
 \fancyhead[CO]{\fancyplain{}{}}
 \fancyhead[RO]{\fancyplain{}{\bfseries\thepage}}
 \fancyfoot[LE]{\fancyplain{}{}}
 \fancyfoot[CE]{\fancyplain{}{}}
 \fancyfoot[RE]{\fancyplain{}{\bfseries\scriptsize Generated by Doxygen }}
 \fancyfoot[LO]{\fancyplain{}{\bfseries\scriptsize Generated by Doxygen }}
 \fancyfoot[CO]{\fancyplain{}{}}
 \fancyfoot[RO]{\fancyplain{}{}}
 \renewcommand{\footrulewidth}{0.4pt}
 \renewcommand{\chaptermark}[1]{%
  \markboth{#1}{}%
 }
 \renewcommand{\sectionmark}[1]{%
  \markright{\thesection\ #1}%
 }
 % Indices & bibliography
 \usepackage{natbib}
 \usepackage[titles]{tocloft}
 \setcounter{tocdepth}{3}
 \setcounter{secnumdepth}{5}
 \makeindex
 \usepackage{newunicodechar}
  \newunicodechar{⁻}{${}^{-}$}% Superscript minus
  \newunicodechar{²}{${}^{2}$}% Superscript two
  \newunicodechar{³}{${}^{3}$}% Superscript three
 % Hyperlinks (required, but should be loaded last)
 \ifpdf
  \usepackage[pdftex,pagebackref=true]{hyperref}
 \else
  \ifxetex
    \usepackage[pagebackref=true]{hyperref}
  \else
    \usepackage[ps2pdf,pagebackref=true]{hyperref}
  \fi
 \fi
 \hypersetup{%
  colorlinks=true,%
  linkcolor=blue,%
  citecolor=blue,%
  unicode%
 }
 % Custom commands
 \newcommand{\clearemptydoublepage}{%
  \newpage{\pagestyle{empty}\cleardoublepage}%
 }
 \usepackage{caption}
 \captionsetup{labelsep=space,justification=centering,font={bf},singlelinecheck=off,skip=4pt,position=top}
 \usepackage{etoc}
 \etocsettocstyle{\doxytocparskip}{\doxynormalparskip}
 \renewcommand{\numberline}[1]{#1~}
 %===== C O N T E N T S =====
 \begin{document}
 % Titlepage & ToC
 \hypersetup{pageanchor=false,
             bookmarksnumbered=true,
             pdfencoding=unicode
            }
 \pagenumbering{alph}
 \begin{titlepage}
 \vspace*{7cm}
 \begin{center}%
 {\Large C++ Library of the Linear Conjugate Gradient Methods (LibLCG)}\\
 \vspace*{1cm}
 {\large Yi Zhang}\\
 \end{center}
 \end{titlepage}
 \clearemptydoublepage
 \pagenumbering{roman}
 \tableofcontents
 \clearemptydoublepage
 \pagenumbering{arabic}
 \hypersetup{pageanchor=true}
 %--- Begin generated contents ---
--- a/refman.pdf
+++ b/refman.pdf
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -0,0 +1,181 @@
 # Collect the source files found in lib/ into LCGLIB_SRC
 aux_source_directory(lib LCGLIB_SRC)
 # Drop the Eigen-based sources when Eigen support is switched off
 if(NOT LibLCG_EIGEN)
 	list(REMOVE_ITEM LCGLIB_SRC "lib/algebra_eigen.cpp")
 	list(REMOVE_ITEM LCGLIB_SRC "lib/lcg_eigen.cpp")
 	list(REMOVE_ITEM LCGLIB_SRC "lib/clcg_eigen.cpp")
 	list(REMOVE_ITEM LCGLIB_SRC "lib/solver_eigen.cpp")
 	list(REMOVE_ITEM LCGLIB_SRC "lib/preconditioner_eigen.cpp")
 endif()
 # Drop the CUDA sources when CUDA support is switched off
 if(NOT LibLCG_CUDA)
 	list(REMOVE_ITEM LCGLIB_SRC "lib/algebra_cuda.cu")
 	list(REMOVE_ITEM LCGLIB_SRC "lib/lcg_complex_cuda.cu")
 	list(REMOVE_ITEM LCGLIB_SRC "lib/lcg_cuda.cu")
 	list(REMOVE_ITEM LCGLIB_SRC "lib/clcg_cuda.cu")
 	# NOTE(review): the header installed further down is lib/clcg_cudaf.h; confirm
 	# that the source file really is named clcg_cuda_f.cu
 	list(REMOVE_ITEM LCGLIB_SRC "lib/clcg_cuda_f.cu")
 	list(REMOVE_ITEM LCGLIB_SRC "lib/solver_cuda.cu")
 	list(REMOVE_ITEM LCGLIB_SRC "lib/preconditioner_cuda.cu")
 endif()
 # Library build
 # Target names must be unique, so the shared and the static library cannot both be created under the same target name
 # There is no need to add the lib prefix or a file extension to the target name; CMake adds them itself
 add_library(lcg SHARED ${LCGLIB_SRC})
 # Add the static library target
 add_library(lcg_static STATIC ${LCGLIB_SRC})
 # Set the output name of the static library so that it is named like the shared one
 set_target_properties(lcg_static PROPERTIES OUTPUT_NAME "lcg")
 # Set target properties so that the shared and the static library are both produced
 set_target_properties(lcg PROPERTIES CLEAN_DIRECT_OUTPUT 1)
 set_target_properties(lcg_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
 # Compile the CUDA sources of both libraries for architecture 70
 if(LibLCG_CUDA)
 	set_target_properties(lcg PROPERTIES CUDA_ARCHITECTURES 70)
 	set_target_properties(lcg_static PROPERTIES CUDA_ARCHITECTURES 70)
 endif()
 # Set the version number of the shared library
 set_target_properties(lcg PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR})
 # Set the output directory of the library files
 set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
 # Set the compile options
 # NOTE(review): CMAKE_CXX_STANDARD only initializes the CXX_STANDARD property of
 # targets created after this point, and lcg / lcg_static are created above -
 # confirm whether the libraries are meant to be pinned to C++11
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
 # Eigen: locate the package and add its headers to the include path
 if(LibLCG_EIGEN)
 	find_package(Eigen3 REQUIRED)
 	if(EIGEN3_FOUND)
 		message(STATUS "Eigen3 Found.")
 		include_directories(${EIGEN3_INCLUDE_DIR})
 	endif()
 endif()
 # CUDA: enable the language, then locate and link cuBLAS, cuSPARSE and cuSOLVER
 if(LibLCG_CUDA)
 	enable_language(CUDA)
 	find_package(CUDA REQUIRED)
 	if(CUDA_FOUND)
 		message(STATUS "CUDA Found.")
 		include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 		find_library(CUBLAS_LIBRARY cublas ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
 		find_library(CUSPARSE_LIBRARY cusparse ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
 		find_library(CUSOLVER_LIBRARY cusolver ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
 		target_link_libraries(lcg PUBLIC ${CUBLAS_LIBRARY})
 		target_link_libraries(lcg_static ${CUBLAS_LIBRARY})
 		target_link_libraries(lcg PUBLIC ${CUSPARSE_LIBRARY})
 		target_link_libraries(lcg_static ${CUSPARSE_LIBRARY})
 		target_link_libraries(lcg PUBLIC ${CUSOLVER_LIBRARY})
 		target_link_libraries(lcg_static ${CUSOLVER_LIBRARY})
 	endif()
 endif()
 if(LibLCG_OPENMP)
 	# Add the OpenMP compile and link flags and link the OpenMP runtime
 	find_package(OpenMP REQUIRED)
 	if (OpenMP_CXX_FOUND)
 		message(STATUS "OpenMP Found.")
 		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
 		set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
 		target_link_libraries(lcg PUBLIC OpenMP::OpenMP_CXX)
 		target_link_libraries(lcg_static OpenMP::OpenMP_CXX)
 	endif()
 endif()
 set(CONFIG_FILE_PATH lib/cmake/${PROJECT_NAME})
 configure_package_config_file(${PROJECT_SOURCE_DIR}/${PROJECT_NAME}Config.cmake.in 
 	${CMAKE_BINARY_DIR}/${PROJECT_NAME}Config.cmake
 	INSTALL_DESTINATION ${CONFIG_FILE_PATH}
 	NO_CHECK_REQUIRED_COMPONENTS_MACRO)
 write_basic_package_version_file(${CMAKE_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
        VERSION ${PROJECT_VERSION}
        COMPATIBILITY SameMajorVersion)
 # Installation rules for the libraries
 if(WIN32)
 	install(TARGETS lcg DESTINATION lib)
 	install(TARGETS lcg_static DESTINATION lib)
 else()
 	install(TARGETS lcg lcg_static
 		EXPORT ${PROJECT_NAME}Targets
 		LIBRARY DESTINATION lib
 		ARCHIVE DESTINATION lib)
 	install(EXPORT ${PROJECT_NAME}Targets
 	        DESTINATION ${CONFIG_FILE_PATH})
 	install(FILES
        ${CMAKE_BINARY_DIR}/${PROJECT_NAME}Config.cmake
        ${CMAKE_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
        DESTINATION ${CONFIG_FILE_PATH})
 endif()
 # Installation rules for the header files
 install(FILES lib/config.h DESTINATION include/lcg)
 install(FILES lib/algebra.h DESTINATION include/lcg)
 install(FILES lib/lcg_complex.h DESTINATION include/lcg)
 install(FILES lib/util.h DESTINATION include/lcg)
 install(FILES lib/lcg.h DESTINATION include/lcg)
 install(FILES lib/clcg.h DESTINATION include/lcg)
 install(FILES lib/solver.h DESTINATION include/lcg)
 install(FILES lib/preconditioner.h DESTINATION include/lcg)
 if(LibLCG_CUDA)
 	install(FILES lib/algebra_cuda.h DESTINATION include/lcg)
 	install(FILES lib/lcg_complex_cuda.h DESTINATION include/lcg)
 	install(FILES lib/lcg_cuda.h DESTINATION include/lcg)
 	install(FILES lib/clcg_cuda.h DESTINATION include/lcg)
 	install(FILES lib/clcg_cudaf.h DESTINATION include/lcg)
 	install(FILES lib/solver_cuda.h DESTINATION include/lcg)
 	install(FILES lib/preconditioner_cuda.h DESTINATION include/lcg)
 endif()
 if(LibLCG_EIGEN)
 	install(FILES lib/algebra_eigen.h DESTINATION include/lcg)
 	install(FILES lib/lcg_eigen.h DESTINATION include/lcg)
 	install(FILES lib/clcg_eigen.h DESTINATION include/lcg)
 	install(FILES lib/solver_eigen.h DESTINATION include/lcg)
 	install(FILES lib/preconditioner_eigen.h DESTINATION include/lcg)
 endif()
 # Build of the sample programs
 # Set the output directory of the executables
 set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 # Build rule for one sample program.
 #   name - name of the executable target
 #   file - source file of the sample, relative to the sample/ directory
 macro(add_sample name file)
 	# Add the executable (command-line program)
 	add_executable(${name} sample/${file})
 	# Add the library search path for the installed executable; this has no effect on Windows and is simply ignored there
 	set_target_properties(${name} PROPERTIES INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
 	# Link against the shared library
 	target_link_libraries(${name} PUBLIC lcg)
 	# CUDA settings
 	if(LibLCG_CUDA)
 		set_target_properties(${name} PROPERTIES CUDA_ARCHITECTURES 70)
 	endif()
 endmacro()
 # Samples that are always built
 add_sample(lcg_sample1 sample1.cpp)
 add_sample(lcg_sample2 sample2.cpp)
 add_sample(lcg_sample3 sample3.cpp)
 add_sample(lcg_sample4 sample4.cpp)
 # Samples built only when Eigen support is enabled
 if(LibLCG_EIGEN)
 	add_sample(lcg_sample5 sample5.cpp)
 	add_sample(lcg_sample7 sample7.cpp)
 	# sample6 is additionally guarded by the std::complex option
 	if(LibLCG_STD_COMPLEX)
 		add_sample(lcg_sample6 sample6.cpp) 
 	endif()
 endif()
 # Samples built only when CUDA support is enabled
 if(LibLCG_CUDA)
 	# The followings are not working for now due to CUDA 12+ compatibility issues. Check more later
 	#add_sample(lcg_sample8 sample8.cu)
 	#add_sample(lcg_sample9 sample9.cu)
 	#add_sample(lcg_sample10 sample10.cu)
 	#add_sample(lcg_sample11 sample11.cu)
 	#add_sample(lcg_sample12 sample12.cu)
 	#add_sample(lcg_sample13 sample13.cu)
 	#add_sample(lcg_sample14 sample14.cu)
 	add_sample(lcg_sample15 sample15.cu)
 endif()
--- a/src/lib/algebra.cpp
+++ b/src/lib/algebra.cpp
@@ -0,0 +1,222 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "cmath"
 #include "ctime"
 #include "random"
 #include "algebra.h"
 #ifdef LibLCG_OPENMP
 #include "omp.h"
 #endif
 /**
 * @brief      Absolute value of a real number.
 *
 * @param[in]  a     input value
 *
 * @return     a itself when a >= 0.0, otherwise -1.0*a
 */
 lcg_float lcg_abs(lcg_float a)
 {
 	return (a >= 0.0) ? a : -1.0*a;
 }
 /**
 * @brief      Larger of two real numbers.
 *
 * @param[in]  a     first value
 * @param[in]  b     second value
 *
 * @return     a when a >= b, otherwise b
 */
 lcg_float lcg_max(lcg_float a, lcg_float b)
 {
 	return (a >= b) ? a : b;
 }
 /**
 * @brief      Smaller of two real numbers.
 *
 * @param[in]  a     first value
 * @param[in]  b     second value
 *
 * @return     a when a <= b, otherwise b
 */
 lcg_float lcg_min(lcg_float a, lcg_float b)
 {
 	return (a <= b) ? a : b;
 }
 /**
 * @brief      Clamp a value into the box [low, hig].
 *
 * The upper bound is tested before the lower bound. When a bound is flagged
 * as excluded, a value on or beyond it is moved just inside the box instead
 * of onto the bound itself.
 *
 * @param[in]  low        lower boundary
 * @param[in]  hig        upper boundary
 * @param[in]  a          value to constrain
 * @param[in]  low_bound  true if the lower boundary value itself is allowed
 * @param[in]  hig_bound  true if the upper boundary value itself is allowed
 *
 * @return     the constrained value
 */
 lcg_float lcg_set2box(lcg_float low, lcg_float hig, lcg_float a, 
 	bool low_bound, bool hig_bound)
 {
 	if (a >= hig)
 	{
 		if (hig_bound) return hig;
 		// A fixed 1e-16 offset is absorbed by rounding once |hig| is large
 		// enough, which would hand back the excluded bound itself. Keep the
 		// historical offset where it is effective and otherwise step to the
 		// nearest representable value below hig.
 		const lcg_float inside = hig - 1e-16;
 		return (inside < hig) ? inside : std::nextafter(hig, -HUGE_VAL);
 	}
 	if (a <= low)
 	{
 		if (low_bound) return low;
 		// Same reasoning as above, mirrored for the lower bound.
 		const lcg_float inside = low + 1e-16;
 		return (inside > low) ? inside : std::nextafter(low, HUGE_VAL);
 	}
 	return a;
 }
 /**
 * @brief      Allocate an array of n lcg_float values with new[].
 *
 * The elements are not initialized here.
 *
 * @param[in]  n     number of elements
 *
 * @return     pointer to the new array
 */
 lcg_float* lcg_malloc(int n)
 {
 	return new lcg_float [n];
 }
 /**
 * @brief      Allocate m separate rows of n lcg_float values each.
 *
 * Every row is its own new[] allocation; the elements are not initialized here.
 *
 * @param[in]  m     number of rows
 * @param[in]  n     number of elements per row
 *
 * @return     pointer to the array of row pointers
 */
 lcg_float** lcg_malloc(int m, int n)
 {
 	lcg_float **rows = new lcg_float* [m];
 	int r = 0;
 	while (r < m)
 	{
 		rows[r] = new lcg_float [n];
 		++r;
 	}
 	return rows;
 }
 /**
 * @brief      Release an array with delete[].
 *
 * The pointer is received by value, so the caller's own pointer variable is
 * left unchanged by this call.
 *
 * @param      x     pointer of the array; a null pointer is accepted
 */
 void lcg_free(lcg_float* x)
 {
 	// delete[] on a null pointer does nothing, so no explicit check is needed.
 	delete[] x;
 }
 /**
 * @brief      Release a 2D array: each of the m rows first, then the row table.
 *
 * The pointer is received by value, so the caller's own pointer variable is
 * left unchanged by this call.
 *
 * @param      x     pointer of the row table; a null pointer is accepted
 * @param[in]  m     number of rows
 */
 void lcg_free(lcg_float **x, int m)
 {
 	if (x == nullptr) return;
 	int r = 0;
 	while (r < m)
 	{
 		delete[] x[r];
 		++r;
 	}
 	delete[] x;
 }
 /**
 * @brief      Assign the same value to every element of a vector.
 *
 * @param      a     pointer of the vector
 * @param[in]  b     value to store
 * @param[in]  size  number of elements
 */
 void lcg_vecset(lcg_float *a, lcg_float b, int size)
 {
 	int k = 0;
 	while (k < size)
 	{
 		a[k] = b;
 		++k;
 	}
 }
 /**
 * @brief      Assign the same value to every element of an m-by-n array.
 *
 * @param      a     pointer of the row table
 * @param[in]  b     value to store
 * @param[in]  m     number of rows
 * @param[in]  n     number of elements per row
 */
 void lcg_vecset(lcg_float **a, lcg_float b, int m, int n)
 {
 	for (int r = 0; r < m; r++)
 	{
 		lcg_float *row = a[r];
 		for (int c = 0; c < n; c++)
 		{
 			row[c] = b;
 		}
 	}
 }
 /**
 * @brief      Fill a vector with uniformly distributed random values.
 *
 * Each element is computed as (h-l)*u + l with u drawn from [0, 1).
 *
 * @param      a     pointer of the vector
 * @param[in]  l     lower bound of the random values
 * @param[in]  h     higher bound of the random values
 * @param[in]  size  number of elements
 */
 void lcg_vecrnd(lcg_float *a, lcg_float l, lcg_float h, int size)
 {
 	// The generator is seeded once per thread on first use. Re-seeding with
 	// time(nullptr) on every call made two calls within the same second
 	// return identical data and also reset the global rand() sequence.
 	static thread_local std::mt19937 engine{std::random_device{}()};
 	std::uniform_real_distribution<lcg_float> unit(0.0, 1.0);
 	for (int i = 0; i < size; i++)
 	{
 		a[i] = (h-l)*unit(engine) + l;
 	}
 	return;
 }
 /**
 * @brief      Fill an m-by-n array with uniformly distributed random values.
 *
 * Each element is computed as (h-l)*u + l with u drawn from [0, 1).
 *
 * @param      a     pointer of the row table
 * @param[in]  l     lower bound of the random values
 * @param[in]  h     higher bound of the random values
 * @param[in]  m     number of rows
 * @param[in]  n     number of elements per row
 */
 void lcg_vecrnd(lcg_float **a, lcg_float l, lcg_float h, int m, int n)
 {
 	// The generator is seeded once per thread on first use. Re-seeding with
 	// time(nullptr) on every call made two calls within the same second
 	// return identical data and also reset the global rand() sequence.
 	static thread_local std::mt19937 engine{std::random_device{}()};
 	std::uniform_real_distribution<lcg_float> unit(0.0, 1.0);
 	for (int i = 0; i < m; i++)
 	{
 		for (int j = 0; j < n; j++)
 		{
 			a[i][j] = (h-l)*unit(engine) + l;
 		}
 	}
 	return;
 }
 /**
 * @brief      Sum of the squares of the vector elements (squared L2 norm).
 *
 * @param      a     pointer of the vector
 * @param[in]  n     number of elements; a non-positive value yields 0
 *
 * @return     sum of a[i]*a[i] over all elements
 */
 double lcg_squaredl2norm(lcg_float *a, int n)
 {
 	lcg_float sum = 0;
 	// The index is a signed int like n. An unsigned index would convert a
 	// negative n into a huge bound and read far beyond the array.
 	for (int i = 0; i < n; i++)
 	{
 		sum += a[i]*a[i];
 	}
 	return sum;
 }
 /**
 * @brief      Dot product of two real vectors.
 *
 * @param[out] ret   receives the sum of a[i]*b[i]; it is reset to 0.0 first
 * @param[in]  a     pointer of the first vector
 * @param[in]  b     pointer of the second vector
 * @param[in]  size  number of elements
 */
 void lcg_dot(lcg_float &ret, const lcg_float *a, 
 	const lcg_float *b, int size)
 {
 	ret = 0.0;
 	int k = 0;
 	while (k < size)
 	{
 		ret += a[k]*b[k];
 		++k;
 	}
 }
 /**
 * @brief      Product of a dense real matrix (or its transpose) and a vector.
 *
 * With layout == MatNormal the m_size outputs are Ax[i] = sum_j A[i][j]*x[j].
 * With any other layout the n_size outputs are Ax[j] = sum_i A[i][j]*x[i].
 * The outer loop of either case is an OpenMP parallel-for.
 *
 * @param      A        matrix stored as m_size rows of n_size values
 * @param[in]  x        input vector
 * @param      Ax       output vector
 * @param[in]  m_size   number of rows of A
 * @param[in]  n_size   number of columns of A
 * @param[in]  layout   MatNormal for A*x, otherwise the transposed product
 */
 void lcg_matvec(lcg_float **A, const lcg_float *x, lcg_float *Ax, 
 	int m_size, int n_size, lcg_matrix_e layout)
 {
 	int r, c;
 	if (layout != MatNormal)
 	{
 		// Transposed product: one output per column of A.
 #pragma omp parallel for private (r, c) schedule(guided)
 		for (c = 0; c < n_size; c++)
 		{
 			Ax[c] = 0.0;
 			for (r = 0; r < m_size; r++)
 			{
 				Ax[c] += A[r][c]*x[r];
 			}
 		}
 		return;
 	}
 	// Plain product: one output per row of A.
 #pragma omp parallel for private (r, c) schedule(guided)
 	for (r = 0; r < m_size; r++)
 	{
 		Ax[r] = 0.0;
 		for (c = 0; c < n_size; c++)
 		{
 			Ax[r] += A[r][c]*x[c];
 		}
 	}
 	return;
 }
 /**
 * @brief      Product of a COO-format sparse matrix and a vector.
 *
 * pre_position == false: p (M values) accumulates Mat[i]*V[col[i]] into p[row[i]].
 * pre_position == true:  p (N values) accumulates Mat[i]*V[row[i]] into p[col[i]].
 * The output is zeroed before accumulation. The indices in row/col are not
 * range-checked here.
 *
 * @param[in]  row           row index of every stored entry
 * @param[in]  col           column index of every stored entry
 * @param[in]  Mat           value of every stored entry
 * @param[in]  V             input vector
 * @param      p             output vector
 * @param[in]  M             number of rows of the matrix
 * @param[in]  N             number of columns of the matrix
 * @param[in]  nz_size       number of stored entries
 * @param[in]  pre_position  selects which of the two products above is formed
 */
 void lcg_matvec_coo(const int *row, const int *col, const lcg_float *Mat, const lcg_float *V, lcg_float *p, int M, int N, int nz_size, bool pre_position)
 {
 	// All loop indices are signed ints like the size arguments. An unsigned
 	// index would turn a negative size into a huge bound and overrun the arrays.
 	if (!pre_position)
 	{
 		for (int i = 0; i < M; i++)
 		{
 			p[i] = 0.0;
 		}
 		for (int i = 0; i < nz_size; i++)
 		{
 			p[row[i]] += Mat[i]*V[col[i]];
 		}
 	}
 	else
 	{
 		for (int i = 0; i < N; i++)
 		{
 			p[i] = 0.0;
 		}
 		for (int i = 0; i < nz_size; i++)
 		{
 			p[col[i]] += Mat[i]*V[row[i]];
 		}
 	}
 	return;
 }
--- a/src/lib/algebra.h
+++ b/src/lib/algebra.h
@@ -0,0 +1,219 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _ALGEBRA_H
 #define _ALGEBRA_H
 #include "config.h"
 /**
 * @brief      Matrix layouts.
 */
 enum lcg_matrix_e
 {
 	MatNormal,
 	MatTranspose,
 };
 /**
 * @brief      Conjugate types for a complex number.
 */
 enum clcg_complex_e
 {
 	NonConjugate,
 	Conjugate,
 };
 /**
 * @brief      A simple definition of the float type we use here. 
 * Easy to change in the future. Right now it is just an alias of double
 */
 typedef double lcg_float;
 /**
 * @brief      Return absolute value
 *
 * @param[in]  a     input value
 *
 * @return     The absolute value
 */
 lcg_float lcg_abs(lcg_float a);
 /**
 * @brief      Return the bigger value
 *
 * @param[in]  a     input value
 * @param[in]  b     input value
 *
 * @return     The bigger value
 */
 lcg_float lcg_max(lcg_float a, lcg_float b);
 /**
 * @brief      Return the smaller value
 *
 * @param[in]  a     input value
 * @param[in]  b     input value
 *
 * @return     The smaller value
 */
 lcg_float lcg_min(lcg_float a, lcg_float b);
 /**
 * @brief      Set the input value within a box constraint
 *
 * @param      low   low boundary
 * @param      hig   high boundary
 * @param      a     input value
 * @param      low_bound    Whether to include the low boundary value
 * @param      hig_bound    Whether to include the high boundary value
 *
 * @return     box constrained value
 */
 lcg_float lcg_set2box(lcg_float low, lcg_float hig, lcg_float a, 
 	bool low_bound = true, bool hig_bound = true);
 /**
 * @brief      Allocate memory for a lcg_float array.
 *
 * @param[in]  n     Size of the lcg_float array.
 *
 * @return     Pointer to the allocated array.
 */
 lcg_float* lcg_malloc(int n);
 /**
 * @brief      Allocate memory for a 2D lcg_float array (an array of row pointers).
 *
 * @param[in]  m     Number of rows.
 * @param[in]  n     Size of each row.
 *
 * @return     Pointer to the allocated array of rows.
 */
 lcg_float** lcg_malloc(int m, int n);
 /**
 * @brief      Release the memory used by a lcg_float array.
 *
 * @note       The pointer is passed by value; the caller's pointer is not reset.
 *
 * @param      x     Pointer of the array.
 */
 void lcg_free(lcg_float* x);
 /**
 * @brief      Release the memory used by a 2D lcg_float array.
 *
 * @note       The pointer is passed by value; the caller's pointer is not reset.
 *
 * @param      x     Pointer of the array.
 * @param[in]  m     Number of rows of the array.
 */
 void lcg_free(lcg_float **x, int m);
 /**
 * @brief      set a vector's value
 *
 * @param      a     pointer of the vector
 * @param[in]  b     initial value
 * @param[in]  size  vector size
 */
 void lcg_vecset(lcg_float *a, lcg_float b, int size);
 /**
 * @brief      set a 2d vector's value
 *
 * @param      a     pointer of the matrix
 * @param[in]  b     initial value
 * @param[in]  m     row size of the matrix
 * @param[in]  n     column size of the matrix
 */
 void lcg_vecset(lcg_float **a, lcg_float b, int m, int n);
 /**
 * @brief      set a vector using random values
 *
 * @param      a     pointer of the vector
 * @param[in]  l     the lower bound of random values
 * @param[in]  h     the higher bound of random values
 * @param[in]  size  size of the vector
 */
 void lcg_vecrnd(lcg_float *a, lcg_float l, lcg_float h, int size);
 /**
 * @brief      set a 2D vector using random values
 *
 * @param      a     pointer of the vector
 * @param[in]  l     the lower bound of random values
 * @param[in]  h     the higher bound of random values
 * @param[in]  m     row size of the vector
 * @param[in]  n     column size of the vector
 */
 void lcg_vecrnd(lcg_float **a, lcg_float l, lcg_float h, int m, int n);
 /**
 * @brief    calculate the squared L2 norm of the input vector
 * 
 * @param a         pointer of the vector
 * @param n         size of the vector
 * @return double   squared L2 norm, i.e. the sum of the squared elements
 */
 double lcg_squaredl2norm(lcg_float *a, int n);
 /**
 * @brief      calculate dot product of two real vectors
 *
 * @param[out] ret     the dot product (the function itself returns nothing)
 * @param[in]  a       pointer of the vector a
 * @param[in]  b       pointer of the vector b
 * @param[in]  size    size of the vector
 */
 void lcg_dot(lcg_float &ret, const lcg_float *a, const lcg_float *b, int size);
 /**
 * @brief      calculate product of a real matrix and a vector
 * 
 * Different configurations:
 * layout=MatNormal -> A
 * layout=MatTranspose -> A^T
 *
 * @param      A          matrix A
 * @param[in]  x          vector x
 * @param      Ax         product of Ax (m_size values for MatNormal, n_size values for MatTranspose)
 * @param[in]  m_size     row size of A
 * @param[in]  n_size     column size of A
 * @param[in]  layout     layout of A used for multiplication. Must be MatNormal or MatTranspose
 */
 void lcg_matvec(lcg_float **A, const lcg_float *x, lcg_float *Ax, int m_size, int n_size, 
 	lcg_matrix_e layout = MatNormal);
 /**
 * @brief     Calculate the product of a sparse matrix multiplied by a vector. The matrix is stored in the COO format.
 * 
 * @param row             Row index of the input sparse matrix.
 * @param col             Column index of the input sparse matrix.
 * @param Mat             Non-zero values of the input sparse matrix.
 * @param V               Multiplier vector
 * @param p               Output product (M values if pre_position is false, N values if it is true)
 * @param M               Row number of the sparse matrix
 * @param N               Column number of the sparse matrix
 * @param nz_size         Non-zero size of the matrix
 * @param pre_position    If true, the multiplier is seen as a row vector. Otherwise, it is treated as a column vector.
 */
 void lcg_matvec_coo(const int *row, const int *col, const lcg_float *Mat, const lcg_float *V, lcg_float *p, int M, int N, int nz_size, bool pre_position = false);
 #endif //_ALGEBRA_H
--- a/src/lib/algebra_cuda.cu
+++ b/src/lib/algebra_cuda.cu
@@ -0,0 +1,110 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "algebra_cuda.h"
 /**
 * @brief      Kernel: clamp every a[i] between low[i] and hig[i], one element per thread.
 *
 * Threads are indexed along x only; threads with an index >= n do nothing.
 * The flags only choose between an inclusive and a strict comparison. The
 * value stored on a hit is the bound itself in both cases.
 *
 * @param[in]  low        per-element lower boundaries
 * @param[in]  hig        per-element upper boundaries
 * @param      a          values to clamp in place
 * @param[in]  n          number of elements
 * @param[in]  low_bound  use <= (true) or < (false) against low
 * @param[in]  hig_bound  use >= (true) or > (false) against hig
 */
 __global__ void lcg_set2box_cuda_device(const lcg_float *low, const lcg_float *hig, lcg_float *a, 
    int n, bool low_bound, bool hig_bound)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= n) return;

 	lcg_float v = a[idx];
 	// The upper bound is applied first, then the lower bound.
 	if (hig_bound ? (v >= hig[idx]) : (v > hig[idx])) v = hig[idx];
 	if (low_bound ? (v <= low[idx]) : (v < low[idx])) v = low[idx];
 	a[idx] = v;
 }
 /**
 * @brief      Kernel: extract the diagonal of a square CSR matrix, one row per thread.
 *
 * Threads are indexed along x only; threads with an index >= A_len do nothing.
 * A row that stores no diagonal entry yields 0.0, so every A_diag[i] with
 * i < A_len is written.
 *
 * @param[in]  A_ptr   row pointers (A_len + 1 values are read)
 * @param[in]  A_col   column index of every stored entry
 * @param[in]  A_val   value of every stored entry
 * @param[in]  A_len   number of rows
 * @param      A_diag  output diagonal values
 */
 __global__ void lcg_smDcsr_get_diagonal_device(const int *A_ptr, const int *A_col, const lcg_float *A_val, const int A_len, lcg_float *A_diag)
 {
 	const int i = blockIdx.x * blockDim.x + threadIdx.x;
 	if (i < A_len)
 	{
 		// Start from zero: an entry absent from the sparse storage is a zero
 		// of the matrix, and the output must not be left uninitialized.
 		lcg_float diag = 0.0;
 		const int row_end = A_ptr[i + 1];
 		for (int k = A_ptr[i]; k < row_end; k++)
 		{
 			if (A_col[k] == i)
 			{
 				diag = A_val[k];
 				break;
 			}
 		}
 		A_diag[i] = diag;
 	}
 	return;
 }
 /**
 * @brief      Kernel: c[i] = a[i] * b[i], one element per thread.
 *
 * Threads are indexed along x only; threads with an index >= n do nothing.
 *
 * @param[in]  a     first input array
 * @param[in]  b     second input array
 * @param      c     output array
 * @param[in]  n     number of elements
 */
 __global__ void lcg_vecMvecD_element_wise_device(const lcg_float *a, const lcg_float *b, lcg_float *c, int n)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= n) return;
 	c[idx] = a[idx] * b[idx];
 }
 /**
 * @brief      Kernel: c[i] = a[i] / b[i], one element per thread.
 *
 * Threads are indexed along x only; threads with an index >= n do nothing.
 * The divisor is not checked against zero.
 *
 * @param[in]  a     array of dividends
 * @param[in]  b     array of divisors
 * @param      c     output array
 * @param[in]  n     number of elements
 */
 __global__ void lcg_vecDvecD_element_wise_device(const lcg_float *a, const lcg_float *b, lcg_float *c, int n)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= n) return;
 	c[idx] = a[idx] / b[idx];
 }
 /**
 * @brief      Launch lcg_set2box_cuda_device over n elements with 1024 threads per block.
 *
 * The pointers are handed to the kernel unchanged. The launch is not
 * followed by a synchronization or an error query here.
 *
 * @param[in]  low        per-element lower boundaries
 * @param[in]  hig        per-element upper boundaries
 * @param      a          values to clamp in place
 * @param[in]  n          number of elements; nothing is launched when n <= 0
 * @param[in]  low_bound  forwarded to the kernel
 * @param[in]  hig_bound  forwarded to the kernel
 */
 void lcg_set2box_cuda(const lcg_float *low, const lcg_float *hig, lcg_float *a, 
    int n, bool low_bound, bool hig_bound)
 {
 	// A grid with zero blocks is an invalid launch configuration, so an
 	// empty input is simply skipped.
 	if (n <= 0) return;
 	const int blockSize = 1024;
 	// Ceil-division written so that it cannot overflow for large n.
 	const int numBlocks = (n - 1) / blockSize + 1;
 	lcg_set2box_cuda_device<<<numBlocks, blockSize>>>(low, hig, a, n, low_bound, hig_bound);
 	return;
 }
 /**
 * @brief      Launch lcg_smDcsr_get_diagonal_device with one thread per matrix row.
 *
 * The pointers are handed to the kernel unchanged. The launch is not
 * followed by a synchronization or an error query here.
 *
 * @param[in]  A_ptr    row pointers of the CSR matrix
 * @param[in]  A_col    column indices of the CSR matrix
 * @param[in]  A_val    stored values of the CSR matrix
 * @param[in]  A_len    number of rows; nothing is launched when A_len <= 0
 * @param      A_diag   output diagonal values
 * @param[in]  bk_size  threads per block; a non-positive value falls back to 1024
 */
 void lcg_smDcsr_get_diagonal(const int *A_ptr, const int *A_col, const lcg_float *A_val, const int A_len, lcg_float *A_diag, int bk_size)
 {
 	// A grid with zero blocks is an invalid launch configuration.
 	if (A_len <= 0) return;
 	// Protect the division below from a zero or negative block size.
 	const int blockSize = (bk_size > 0) ? bk_size : 1024;
 	// Ceil-division written so that it cannot overflow for large A_len.
 	const int numBlocks = (A_len - 1) / blockSize + 1;
 	lcg_smDcsr_get_diagonal_device<<<numBlocks, blockSize>>>(A_ptr, A_col, A_val, A_len, A_diag);
 	return;
 }
 /**
 * @brief      Launch lcg_vecMvecD_element_wise_device (c[i] = a[i] * b[i]) over n elements.
 *
 * The pointers are handed to the kernel unchanged. The launch is not
 * followed by a synchronization or an error query here.
 *
 * @param[in]  a        first input array
 * @param[in]  b        second input array
 * @param      c        output array
 * @param[in]  n        number of elements; nothing is launched when n <= 0
 * @param[in]  bk_size  threads per block; a non-positive value falls back to 1024
 */
 void lcg_vecMvecD_element_wise(const lcg_float *a, const lcg_float *b, lcg_float *c, int n, int bk_size)
 {
 	// A grid with zero blocks is an invalid launch configuration.
 	if (n <= 0) return;
 	// Protect the division below from a zero or negative block size.
 	const int blockSize = (bk_size > 0) ? bk_size : 1024;
 	// Ceil-division written so that it cannot overflow for large n.
 	const int numBlocks = (n - 1) / blockSize + 1;
 	lcg_vecMvecD_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
 	return;
 }
 /**
 * @brief      Launch lcg_vecDvecD_element_wise_device (c[i] = a[i] / b[i]) over n elements.
 *
 * The pointers are handed to the kernel unchanged. The launch is not
 * followed by a synchronization or an error query here.
 *
 * @param[in]  a        array of dividends
 * @param[in]  b        array of divisors
 * @param      c        output array
 * @param[in]  n        number of elements; nothing is launched when n <= 0
 * @param[in]  bk_size  threads per block; a non-positive value falls back to 1024
 */
 void lcg_vecDvecD_element_wise(const lcg_float *a, const lcg_float *b, lcg_float *c, int n, int bk_size)
 {
 	// A grid with zero blocks is an invalid launch configuration.
 	if (n <= 0) return;
 	// Protect the division below from a zero or negative block size.
 	const int blockSize = (bk_size > 0) ? bk_size : 1024;
 	// Ceil-division written so that it cannot overflow for large n.
 	const int numBlocks = (n - 1) / blockSize + 1;
 	lcg_vecDvecD_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
 	return;
 }
--- a/src/lib/algebra_cuda.h
+++ b/src/lib/algebra_cuda.h
@@ -0,0 +1,88 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _ALGEBRA_CUDA_H
 #define _ALGEBRA_CUDA_H
 #include "algebra.h"
 #ifdef LibLCG_CUDA
 #include <cuda_runtime.h>
 /**
 * @brief      Set every element of an array within its box constraint
 *
 * @note       The arrays are passed to a CUDA kernel as they are — presumably all of them must be device memory; confirm against the callers.
 *
 * @param[in]  low   Array of low boundaries
 * @param[in]  hig   Array of high boundaries
 * @param      a     Array of values, constrained in place
 * @param[in]  n     Length of the arrays
 * @param      low_bound    Whether to include the low boundary value
 * @param      hig_bound    Whether to include the high boundary value
 */
 void lcg_set2box_cuda(const lcg_float *low, const lcg_float *hig, lcg_float *a, 
    int n, bool low_bound = true, bool hig_bound = true);
 /**
 * @brief      Extract diagonal elements from a square CUDA sparse matrix that is formatted in the CSR format
 * 
 * @note       This function is called from the host and works on device data. All memories must be allocated on the GPU device.
 *
 * @param[in]  A_ptr   Row index pointer
 * @param[in]  A_col   Column index
 * @param[in]  A_val   Non-zero values of the matrix
 * @param[in]  A_len   Dimension of the matrix
 * @param      A_diag  Output diagonal elements
 * @param[in]  bk_size CUDA block size (threads per block). The default is 1024.
 */
 void lcg_smDcsr_get_diagonal(const int *A_ptr, const int *A_col, const lcg_float *A_val, const int A_len, lcg_float *A_diag, int bk_size = 1024);
 /**
 * @brief      Element-wise multiplication between two CUDA arrays (c[i] = a[i] * b[i]).
 * 
 * @note       This function is called from the host and works on device data. All memories must be allocated on the GPU device.
 *
 * @param[in]  a     Pointer of the input array
 * @param[in]  b     Pointer of the input array
 * @param      c     Pointer of the output array
 * @param[in]  n     Length of the arrays
 * @param[in]  bk_size CUDA block size (threads per block). The default is 1024.
 */
 void lcg_vecMvecD_element_wise(const lcg_float *a, const lcg_float *b, lcg_float *c, int n, int bk_size = 1024);
 /**
 * @brief      Element-wise division between two CUDA arrays (c[i] = a[i] / b[i]).
 * 
 * @note       This function is called from the host and works on device data. All memories must be allocated on the GPU device.
 *
 * @param[in]  a     Pointer of the input array (dividends)
 * @param[in]  b     Pointer of the input array (divisors)
 * @param      c     Pointer of the output array
 * @param[in]  n     Length of the arrays
 * @param[in]  bk_size CUDA block size (threads per block). The default is 1024.
 */
 void lcg_vecDvecD_element_wise(const lcg_float *a, const lcg_float *b, lcg_float *c, int n, int bk_size = 1024);
 #endif // LibLCG_CUDA
 #endif //_ALGEBRA_CUDA_H
--- a/src/lib/algebra_eigen.cpp
+++ b/src/lib/algebra_eigen.cpp
@@ -0,0 +1,32 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "algebra_eigen.h"
 /**
 * @brief      Apply lcg_set2box element by element to a vector.
 *
 * NOTE(review): m is received by value, so the clamped elements are written
 * to a local copy and are not visible to the caller. Making m a reference
 * needs a matching change of the declaration in algebra_eigen.h.
 *
 * @param[in]  low   lower boundary of every element
 * @param[in]  hig   upper boundary of every element
 * @param      m     values to constrain
 */
 void lcg_set2box_eigen(const Eigen::VectorXd &low, const Eigen::VectorXd &hig, Eigen::VectorXd m)
 {
 	int k = 0;
 	while (k < m.size())
 	{
 		m[k] = lcg_set2box(low[k], hig[k], m[k]);
 		++k;
 	}
 }
--- a/src/lib/algebra_eigen.h
+++ b/src/lib/algebra_eigen.h
@@ -0,0 +1,43 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _ALGEBRA_EIGEN_H
 #define _ALGEBRA_EIGEN_H
 #include "algebra.h"
 #ifdef LibLCG_EIGEN
 #include "Eigen/Dense"
 /**
 * @brief      Set the input values within a box constraint
 *
 * @note       NOTE(review): m is passed by value, so changes made to it do not reach the caller — confirm whether a reference was intended.
 *
 * @param      low          Low boundary of every element
 * @param      hig          High boundary of every element
 * @param      m            Values to be constrained
 */
 void lcg_set2box_eigen(const Eigen::VectorXd &low, const Eigen::VectorXd &hig, Eigen::VectorXd m);
 #endif // LibLCG_EIGEN
 #endif // _ALGEBRA_EIGEN_H
--- a/src/lib/clcg.cpp
+++ b/src/lib/clcg.cpp
@@ -0,0 +1,837 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "clcg.h"
 #include "cmath"
 #include "config.h"
 #ifdef LibLCG_OPENMP
 #include "omp.h"
 #endif
 // Common signature of the solver routines in this file; clcg_solver() picks one of them through this pointer type.
 typedef int (*clcg_solver_ptr)(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, 
 	const lcg_complex* B, const int n_size, const clcg_para* param, void* instance);
 // Forward declarations of the solver routines, so that clcg_solver() can refer to them before their definitions.
 // Selected by CLCG_BICG
 int clbicg(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance);
 // Selected by CLCG_BICG_SYM
 int clbicg_symmetric(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance);
 // Selected by CLCG_CGS, and by any solver id that is not listed
 int clcgs(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance);
 // Selected by CLCG_BICGSTAB
 int clbicgstab(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance);
 // Selected by CLCG_TFQMR
 int cltfqmr(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance);
 /**
 * @brief      Run the solver routine selected by solver_id.
 *
 * All arguments except solver_id are forwarded unchanged to the selected
 * routine, and its return code is returned. An id that is not listed below
 * falls back to clcgs.
 *
 * @param[in]  Afp        callback that evaluates the matrix-vector product
 * @param[in]  Pfp        progress callback
 * @param      m          solution vector
 * @param[in]  B          right-hand-side vector
 * @param[in]  n_size     length of m and B
 * @param[in]  param      solver parameters
 * @param      instance   user data handed to the callbacks
 * @param[in]  solver_id  which solver to run
 *
 * @return     status code of the selected solver
 */
 int clcg_solver(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, 
 	const lcg_complex* B, const int n_size, const clcg_para* param, void* instance, 
 	clcg_solver_enum solver_id)
 {
 	switch (solver_id)
 	{
 		case CLCG_BICG:
 			return clbicg(Afp, Pfp, m, B, n_size, param, instance);
 		case CLCG_BICG_SYM:
 			return clbicg_symmetric(Afp, Pfp, m, B, n_size, param, instance);
 		case CLCG_BICGSTAB:
 			return clbicgstab(Afp, Pfp, m, B, n_size, param, instance);
 		case CLCG_TFQMR:
 			return cltfqmr(Afp, Pfp, m, B, n_size, param, instance);
 		case CLCG_CGS:
 		default:
 			return clcgs(Afp, Pfp, m, B, n_size, param, instance);
 	}
 }
 /**
 * @brief      Bi-conjugate gradient solver for a complex linear system (selected by CLCG_BICG).
 *
 * Each iteration calls Afp twice: once with (MatNormal, NonConjugate) on the
 * search direction d1k and once with (MatTranspose, Conjugate) on the shadow
 * direction d2k.
 *
 * The residual that is reported to Pfp and compared against para.epsilon is
 * sqrt(rk_square)/n_size when para.abs_diff is set, and rk_square/r0_square
 * otherwise (r0_square is the initial rk_square, raised to 1.0 if smaller).
 * The same criterion is used before the first iteration and inside the loop.
 *
 * @param[in]  Afp       callback that evaluates the matrix-vector product
 * @param[in]  Pfp       progress callback, may be nullptr; a non-zero return value stops the iteration
 * @param      m         initial guess on input, updated solution on output
 * @param[in]  B         right-hand-side vector
 * @param[in]  n_size    length of m and B
 * @param[in]  param     solver parameters; defparam2 is used when nullptr
 * @param      instance  user data handed to Afp and Pfp
 *
 * @return     CLCG_INVILAD_VARIABLE_SIZE, CLCG_INVILAD_MAX_ITERATIONS,
 *             CLCG_INVILAD_EPSILON or CLCG_INVALID_POINTER for bad arguments;
 *             LCG_ALREADY_OPTIMIZIED if the initial guess already meets the
 *             criterion; CLCG_CONVERGENCE, CLCG_STOP, CLCG_NAN_VALUE or
 *             LCG_REACHED_MAX_ITERATIONS from the iteration.
 *             NOTE(review): two of these codes carry the LCG_ prefix rather
 *             than CLCG_ — confirm that this is intended.
 */
 int clbicg(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance)
 {
 	// set BiCG parameters (defaults are used when none are given)
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;

 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;

 	if (m == nullptr) return CLCG_INVALID_POINTER;
 	if (B == nullptr) return CLCG_INVALID_POINTER;
 	// Afp is called unconditionally below, so it must not be null.
 	if (Afp == nullptr) return CLCG_INVALID_POINTER;

 	int i;
 	lcg_complex *r1k = nullptr, *r2k = nullptr, *d1k = nullptr, *d2k = nullptr;
 	lcg_complex *Ax = nullptr;
 	r1k = clcg_malloc(n_size); r2k = clcg_malloc(n_size);
 	d1k = clcg_malloc(n_size); d2k = clcg_malloc(n_size);
 	Ax  = clcg_malloc(n_size);

 	lcg_complex ak, Ad1d2, r1r2_next, betak;

 	// Initial residual r1k = B - A*m; the shadow residual starts as its conjugate.
 	Afp(instance, m, Ax, n_size, MatNormal, NonConjugate);

 #pragma omp parallel for private (i) schedule(guided)
 	for (i = 0; i < n_size; i++)
 	{
 		d1k[i] = r1k[i] = B[i] - Ax[i];
 		d2k[i] = r2k[i] = clcg_conjugate(&r1k[i]);
 	}

 	lcg_complex r1r2;
 	clcg_inner(r1r2, r2k, r1k, n_size);

 	lcg_float r0_square, rk_square;
 	lcg_complex rk_mod;
 	clcg_inner(rk_mod, r1k, r1k, n_size);
 	r0_square = rk_square = clcg_square(&rk_mod);

 	if (r0_square < 1.0) r0_square = 1.0;

 	int ret, t = 0;
 	// Test the initial guess with the criterion selected by abs_diff only.
 	// Falling back to the relative test while abs_diff is set could report
 	// an already-optimized start that the loop below would not accept.
 	lcg_float residual = para.abs_diff ? sqrt(rk_square)/n_size : rk_square/r0_square;
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, m, residual, &para, n_size, 0);
 		}
 		goto func_ends;
 	}

 	while(1)
 	{
 		if (para.abs_diff) residual = sqrt(rk_square)/n_size;
 		else residual = rk_square/r0_square;

 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, m, residual, &para, n_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}

 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}

 		// max_iterations == 0 means no iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		
 		t++;

 		Afp(instance, d1k, Ax, n_size, MatNormal, NonConjugate);
 		clcg_inner(Ad1d2, d2k, Ax, n_size);
 		ak = r1r2/Ad1d2;

 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			m[i] = m[i] + ak*d1k[i];
 			r1k[i] = r1k[i] - ak*Ax[i];
 		}

 		clcg_inner(rk_mod, r1k, r1k, n_size);
 		rk_square = clcg_square(&rk_mod);

 		Afp(instance, d2k, Ax, n_size, MatTranspose, Conjugate);

 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			r2k[i] = r2k[i] - clcg_conjugate(&ak)*Ax[i];
 		}

 		// A value that differs from itself is NaN: give up on a broken iteration.
 		for (i = 0; i < n_size; i++)
 		{
 			if (m[i] != m[i])
 			{
 				ret = CLCG_NAN_VALUE; goto func_ends;
 			}
 		}

 		clcg_inner(r1r2_next, r2k, r1k, n_size);
 		betak = r1r2_next/r1r2;
 		r1r2 = r1r2_next;

 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			d1k[i] = r1k[i] + betak*d1k[i];
 			d2k[i] = r2k[i] + clcg_conjugate(&betak)*d2k[i];
 		}
 	}

 	// Single exit point that releases all work arrays.
 	func_ends:
 	{
 		clcg_free(r1k);
 		clcg_free(r2k);
 		clcg_free(d1k);
 		clcg_free(d2k);
 		clcg_free(Ax);
 	}

 	return ret;
 }
 /**
 * @brief      Bi-conjugate gradient solver variant that is selected by CLCG_BICG_SYM.
 *
 * Each iteration calls Afp once, with (MatNormal, NonConjugate), and keeps a
 * single residual/direction pair. The step lengths are formed with clcg_dot,
 * while the convergence measure is formed with clcg_inner.
 *
 * The residual that is reported to Pfp and compared against para.epsilon is
 * sqrt(rk_square)/n_size when para.abs_diff is set, and rk_square/r0_square
 * otherwise (r0_square is the initial rk_square, raised to 1.0 if smaller).
 * The same criterion is used before the first iteration and inside the loop.
 *
 * @param[in]  Afp       callback that evaluates the matrix-vector product
 * @param[in]  Pfp       progress callback, may be nullptr; a non-zero return value stops the iteration
 * @param      m         initial guess on input, updated solution on output
 * @param[in]  B         right-hand-side vector
 * @param[in]  n_size    length of m and B
 * @param[in]  param     solver parameters; defparam2 is used when nullptr
 * @param      instance  user data handed to Afp and Pfp
 *
 * @return     CLCG_INVILAD_VARIABLE_SIZE, CLCG_INVILAD_MAX_ITERATIONS,
 *             CLCG_INVILAD_EPSILON or CLCG_INVALID_POINTER for bad arguments;
 *             LCG_ALREADY_OPTIMIZIED if the initial guess already meets the
 *             criterion; CLCG_CONVERGENCE, CLCG_STOP, CLCG_NAN_VALUE or
 *             LCG_REACHED_MAX_ITERATIONS from the iteration.
 *             NOTE(review): two of these codes carry the LCG_ prefix rather
 *             than CLCG_ — confirm that this is intended.
 */
 int clbicg_symmetric(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance)
 {
 	// set solver parameters (defaults are used when none are given)
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;

 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;

 	if (m == nullptr) return CLCG_INVALID_POINTER;
 	if (B == nullptr) return CLCG_INVALID_POINTER;
 	// Afp is called unconditionally below, so it must not be null.
 	if (Afp == nullptr) return CLCG_INVALID_POINTER;

 	int i;
 	lcg_complex *rk = nullptr, *dk = nullptr;
 	lcg_complex *Ax = nullptr;
 	rk = clcg_malloc(n_size); dk = clcg_malloc(n_size);
 	Ax = clcg_malloc(n_size);

 	lcg_complex ak, rkrk2, betak, dkAx;

 	// Initial residual rk = B - A*m, which is also the first search direction.
 	Afp(instance, m, Ax, n_size, MatNormal, NonConjugate);

 #pragma omp parallel for private (i) schedule(guided)
 	for (i = 0; i < n_size; i++)
 	{
 		dk[i] = rk[i] = B[i] - Ax[i];
 	}

 	lcg_complex rkrk;
 	clcg_dot(rkrk, rk, rk, n_size);

 	lcg_float r0_square, rk_square;
 	lcg_complex rk_mod;
 	clcg_inner(rk_mod, rk, rk, n_size);
 	r0_square = rk_square = clcg_square(&rk_mod);

 	if (r0_square < 1.0) r0_square = 1.0;

 	int ret, t = 0;
 	// Test the initial guess with the criterion selected by abs_diff only.
 	// Falling back to the relative test while abs_diff is set could report
 	// an already-optimized start that the loop below would not accept.
 	lcg_float residual = para.abs_diff ? sqrt(rk_square)/n_size : rk_square/r0_square;
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, m, residual, &para, n_size, 0);
 		}
 		goto func_ends;
 	}

 	while(1)
 	{
 		if (para.abs_diff) residual = sqrt(rk_square)/n_size;
 		else residual = rk_square/r0_square;

 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, m, residual, &para, n_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}

 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}

 		// max_iterations == 0 means no iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		
 		t++;

 		Afp(instance, dk, Ax, n_size, MatNormal, NonConjugate);
 		clcg_dot(dkAx, dk, Ax, n_size);
 		ak = rkrk/dkAx;

 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			m[i] = m[i] + ak*dk[i];
 			rk[i] = rk[i] - ak*Ax[i];
 		}

 		clcg_inner(rk_mod, rk, rk, n_size);
 		rk_square = clcg_square(&rk_mod);

 		// A value that differs from itself is NaN: give up on a broken iteration.
 		for (i = 0; i < n_size; i++)
 		{
 			if (m[i] != m[i])
 			{
 				ret = CLCG_NAN_VALUE; goto func_ends;
 			}
 		}

 		clcg_dot(rkrk2, rk, rk, n_size);
 		betak = rkrk2/rkrk;
 		rkrk = rkrk2;

 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			dk[i] = rk[i] + betak*dk[i];
 		}
 	}

 	// Single exit point that releases all work arrays.
 	func_ends:
 	{
 		clcg_free(rk);
 		clcg_free(dk);
 		clcg_free(Ax);
 	}

 	return ret;
 }
 /**
  * @brief      Complex conjugate gradient squared (CGS) solver.
  *
  * Starts from the guess stored in 'm' and updates it in place until one of the 
  * following happens: the residual measure drops to para.epsilon, the progress 
  * callback returns non-zero, a NaN shows up in 'm', or the iteration limit is hit.
  * The residual measure is sqrt(|r|^2)/n_size when para.abs_diff is set, otherwise 
  * |r|^2 divided by the initial |r|^2 (the divisor is clamped to at least 1.0).
  *
  * @param[in]  Afp         Callback computing A*x (always invoked with MatNormal, NonConjugate).
  * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return stops the solver.
  * @param      m           Initial solution on entry, latest iterate on return.
  * @param[in]  B           Right-hand side vector.
  * @param[in]  n_size      Length of m and B. Must be positive.
  * @param[in]  param       Solver parameters. defparam2 is used when this is nullptr.
  * @param      instance    User data forwarded untouched to Afp and Pfp.
  *
  * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED, LCG_REACHED_MAX_ITERATIONS, 
  * CLCG_NAN_VALUE, or a CLCG_INVILAD_* / CLCG_INVALID_POINTER code for rejected arguments.
  */
 int clcgs(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance)
 {
 	// set CGS parameters (library defaults when the caller passes none)
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	if (m == nullptr) return CLCG_INVALID_POINTER;
 	if (B == nullptr) return CLCG_INVALID_POINTER;
 	int i;
 	lcg_complex *rk = nullptr, *rbar0 = nullptr, *pk = nullptr;
 	lcg_complex *Ax = nullptr, *uk = nullptr, *qk = nullptr, *wk = nullptr; // w_k = u_{k-1} + q_k
 	rk = clcg_malloc(n_size); rbar0 = clcg_malloc(n_size);
 	pk = clcg_malloc(n_size); Ax  = clcg_malloc(n_size);
 	uk = clcg_malloc(n_size); qk  = clcg_malloc(n_size);
 	wk = clcg_malloc(n_size);
 	// Every local is declared ahead of the first 'goto func_ends' so that no jump
 	// bypasses an initialization.
 	lcg_complex ak, rhok, rhok2, sigma, betak, rk_mod;
 	lcg_float r0_square, rk_square, residual;
 	int ret, t = 0;
 	// r0 = B - A*m; the search directions start out equal to r0
 	Afp(instance, m, Ax, n_size, MatNormal, NonConjugate);
 #pragma omp parallel for private (i) schedule(guided)
 	for (i = 0; i < n_size; i++)
 	{
 		pk[i] = uk[i] = rk[i] = B[i] - Ax[i];
 	}
 	clcg_inner(rk_mod, rk, rk, n_size);
 	r0_square = rk_square = clcg_square(&rk_mod);
 	if (r0_square < 1.0) r0_square = 1.0;
 	// Test the starting guess BEFORE drawing the shadow residual. With a vanishing
 	// r0 the product of rbar0 and rk stays below the 1e-8 threshold for every draw
 	// and the retry loop below would never terminate.
 	// The same measure as in the main loop is used, so the relative test can no
 	// longer end an abs_diff run early.
 	if (para.abs_diff) residual = sqrt(rk_square)/n_size;
 	else residual = rk_square/r0_square;
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, m, residual, &para, n_size, 0);
 		}
 		goto func_ends;
 	}
 	// Draw a random shadow residual rbar0 until its product with r0 is not negligible.
 	// NOTE(review): a non-converged but extremely small r0 could still keep this loop
 	// spinning; consider bounding the number of retries.
 	do
 	{
 		clcg_vecrnd(rbar0, lcg_complex(1.0, 0.0), lcg_complex(2.0, 0.0), n_size);
 		clcg_inner(rhok, rbar0, rk, n_size);
 	} while (clcg_module(&rhok) < 1e-8);
 	while(1)
 	{
 		if (para.abs_diff) residual = sqrt(rk_square)/n_size;
 		else residual = rk_square/r0_square;
 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, m, residual, &para, n_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}
 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}
 		// max_iterations == 0 means "no limit"
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		t++;
 		Afp(instance, pk, Ax, n_size, MatNormal, NonConjugate); // vk = Apk
 		clcg_inner(sigma, rbar0, Ax, n_size);
 		ak = rhok/sigma;
 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			qk[i] = uk[i] - ak*Ax[i];
 			wk[i] = uk[i] + qk[i];
 		}
 		// Ax is reused for A*wk from here on
 		Afp(instance, wk, Ax, n_size, MatNormal, NonConjugate);
 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			m[i] = m[i] + ak*wk[i];
 			rk[i] = rk[i] - ak*Ax[i];
 		}
 		clcg_inner(rk_mod, rk, rk, n_size);
 		rk_square = clcg_square(&rk_mod);
 		// a value that differs from itself is NaN
 		for (i = 0; i < n_size; i++)
 		{
 			if (m[i] != m[i])
 			{
 				ret = CLCG_NAN_VALUE; goto func_ends;
 			}
 		}
 		clcg_inner(rhok2, rbar0, rk, n_size);
 		betak = rhok2/rhok;
 		rhok = rhok2;
 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			uk[i] = rk[i] + betak*qk[i];
 			pk[i] = uk[i] + betak*(qk[i] + betak*pk[i]);
 		}
 	}
 	func_ends:
 	{
 		clcg_free(rk);
 		clcg_free(rbar0);
 		clcg_free(pk);
 		clcg_free(Ax);
 		clcg_free(uk);
 		clcg_free(qk);
 		clcg_free(wk);
 	}
 	return ret;
 }
 /**
  * @brief      Complex stabilized bi-conjugate gradient (BiCGSTAB) solver.
  *
  * Starts from the guess stored in 'm' and updates it in place until one of the 
  * following happens: the residual measure drops to para.epsilon, the progress 
  * callback returns non-zero, a NaN shows up in 'm', or the iteration limit is hit.
  * The residual measure is sqrt(|r|^2)/n_size when para.abs_diff is set, otherwise 
  * |r|^2 divided by the initial |r|^2 (the divisor is clamped to at least 1.0).
  *
  * @param[in]  Afp         Callback computing A*x (always invoked with MatNormal, NonConjugate).
  * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return stops the solver.
  * @param      m           Initial solution on entry, latest iterate on return.
  * @param[in]  B           Right-hand side vector.
  * @param[in]  n_size      Length of m and B. Must be positive.
  * @param[in]  param       Solver parameters. defparam2 is used when this is nullptr.
  * @param      instance    User data forwarded untouched to Afp and Pfp.
  *
  * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED, LCG_REACHED_MAX_ITERATIONS, 
  * CLCG_NAN_VALUE, or a CLCG_INVILAD_* / CLCG_INVALID_POINTER code for rejected arguments.
  */
 int clbicgstab(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance)
 {
 	// set BICGSTAB parameters (library defaults when the caller passes none)
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	if (m == nullptr) return CLCG_INVALID_POINTER;
 	if (B == nullptr) return CLCG_INVALID_POINTER;
 	int i;
 	lcg_complex *rk = nullptr, *rbar0 = nullptr, *pk = nullptr, *sk = nullptr;
 	lcg_complex *Ap = nullptr, *As = nullptr;
 	rk = clcg_malloc(n_size); rbar0 = clcg_malloc(n_size);
 	pk = clcg_malloc(n_size); sk = clcg_malloc(n_size);
 	Ap = clcg_malloc(n_size); As = clcg_malloc(n_size);
 	// Every local is declared ahead of the first 'goto func_ends' so that no jump
 	// bypasses an initialization.
 	lcg_complex ak, rhok, rhok2, sigma, omega, betak, Ass, AsAs, rk_mod;
 	lcg_float r0_square, rk_square, residual;
 	int ret, t = 0;
 	// r0 = B - A*m; the first search direction equals r0
 	Afp(instance, m, Ap, n_size, MatNormal, NonConjugate);
 #pragma omp parallel for private (i) schedule(guided)
 	for (i = 0; i < n_size; i++)
 	{
 		pk[i] = rk[i] = B[i] - Ap[i];
 	}
 	clcg_inner(rk_mod, rk, rk, n_size);
 	r0_square = rk_square = clcg_square(&rk_mod);
 	if (r0_square < 1.0) r0_square = 1.0;
 	// Test the starting guess BEFORE drawing the shadow residual. With a vanishing
 	// r0 the product of rbar0 and rk stays below the 1e-8 threshold for every draw
 	// and the retry loop below would never terminate.
 	// The same measure as in the main loop is used, so the relative test can no
 	// longer end an abs_diff run early.
 	if (para.abs_diff) residual = sqrt(rk_square)/n_size;
 	else residual = rk_square/r0_square;
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, m, residual, &para, n_size, 0);
 		}
 		goto func_ends;
 	}
 	// Draw a random shadow residual rbar0 until its product with r0 is not negligible.
 	// NOTE(review): a non-converged but extremely small r0 could still keep this loop
 	// spinning; consider bounding the number of retries.
 	do
 	{
 		clcg_vecrnd(rbar0, lcg_complex(1.0, 0.0), lcg_complex(2.0, 0.0), n_size);
 		clcg_inner(rhok, rbar0, rk, n_size);
 	} while (clcg_module(&rhok) < 1e-8);
 	while(1)
 	{
 		if (para.abs_diff) residual = sqrt(rk_square)/n_size;
 		else residual = rk_square/r0_square;
 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, m, residual, &para, n_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}
 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}
 		// max_iterations == 0 means "no limit"
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		t++;
 		Afp(instance, pk, Ap, n_size, MatNormal, NonConjugate);
 		clcg_inner(sigma, rbar0, Ap, n_size);
 		ak = rhok/sigma;
 		// intermediate residual s = r - ak*A*p
 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			sk[i] = rk[i] - ak*Ap[i];
 		}
 		Afp(instance, sk, As, n_size, MatNormal, NonConjugate);
 		clcg_inner(Ass, As, sk, n_size);
 		clcg_inner(AsAs, As, As, n_size);
 		omega = Ass/AsAs;
 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			m[i] = m[i] + ak*pk[i] + omega*sk[i];
 			rk[i] = sk[i] - omega*As[i];
 		}
 		clcg_inner(rk_mod, rk, rk, n_size);
 		rk_square = clcg_square(&rk_mod);
 		// a value that differs from itself is NaN
 		for (i = 0; i < n_size; i++)
 		{
 			if (m[i] != m[i])
 			{
 				ret = CLCG_NAN_VALUE; goto func_ends;
 			}
 		}
 		clcg_inner(rhok2, rbar0, rk, n_size);
 		betak = rhok2*ak/(rhok*omega);
 		rhok = rhok2;
 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			pk[i] = rk[i] + betak*(pk[i] - omega*Ap[i]);
 		}
 	}
 	func_ends:
 	{
 		clcg_free(rk);
 		clcg_free(rbar0);
 		clcg_free(pk);
 		clcg_free(sk);
 		clcg_free(Ap);
 		clcg_free(As);
 	}
 	return ret;
 }
 /**
  * @brief      Complex transpose-free quasi-minimal residual (TFQMR) solver.
  *
  * Starts from the guess stored in 'm' and updates it in place. Every pass of the 
  * outer loop performs two solution updates (j = 1, 2); the progress callback, the 
  * convergence test and the iteration limit are evaluated before each of them.
  * The residual measure is sqrt(|r|^2)/n_size when para.abs_diff is set, otherwise 
  * |r|^2 divided by the initial |r|^2 (the divisor is clamped to at least 1.0).
  *
  * @param[in]  Afp         Callback computing A*x (always invoked with MatNormal, NonConjugate).
  * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return stops the solver.
  * @param      m           Initial solution on entry, latest iterate on return.
  * @param[in]  B           Right-hand side vector.
  * @param[in]  n_size      Length of m and B. Must be positive.
  * @param[in]  param       Solver parameters. defparam2 is used when this is nullptr.
  * @param      instance    User data forwarded untouched to Afp and Pfp.
  *
  * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED, LCG_REACHED_MAX_ITERATIONS, 
  * CLCG_NAN_VALUE, or a CLCG_INVILAD_* / CLCG_INVALID_POINTER code for rejected arguments.
  */
 int cltfqmr(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
 	const int n_size, const clcg_para* param, void* instance)
 {
 	// set TFQMR parameters (library defaults when the caller passes none)
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	if (m == nullptr) return CLCG_INVALID_POINTER;
 	if (B == nullptr) return CLCG_INVALID_POINTER;
 	int i, j;
 	lcg_complex *pk = nullptr, *uk = nullptr;
 	lcg_complex *vk = nullptr, *dk = nullptr;
 	lcg_complex *rbar0 = nullptr, *rk = nullptr;
 	lcg_complex *Ax = nullptr, *qk = nullptr;
 	lcg_complex *uqk = nullptr;
 	pk = clcg_malloc(n_size); uk = clcg_malloc(n_size);
 	vk = clcg_malloc(n_size); dk = clcg_malloc(n_size);
 	rbar0 = clcg_malloc(n_size); rk = clcg_malloc(n_size);
 	Ax = clcg_malloc(n_size); qk = clcg_malloc(n_size);
 	uqk = clcg_malloc(n_size);
 	// Every local is declared ahead of the first 'goto func_ends' so that no jump
 	// bypasses an initialization.
 	lcg_complex rho, rho2, rk_mod, rk_mod2;
 	lcg_complex sigma, alpha, betak, sign, eta(0.0, 0.0);
 	lcg_float r0_square, rk_square, residual;
 	lcg_float theta = 0.0, omega, tao;
 	int ret, t = 0;
 	// r0 = B - A*m; p0 = u0 = r0 and d0 = 0
 	Afp(instance, m, Ax, n_size, MatNormal, NonConjugate);
 #pragma omp parallel for private (i) schedule(guided)
 	for (i = 0; i < n_size; i++)
 	{
 		pk[i] = uk[i] = rk[i] = B[i] - Ax[i];
 		clcg_set(&dk[i], 0.0, 0.0);
 	}
 	clcg_inner(rk_mod, rk, rk, n_size);
 	r0_square = rk_square = clcg_square(&rk_mod);
 	if (r0_square < 1.0) r0_square = 1.0;
 	omega = clcg_module(&rk_mod);
 	tao = omega;
 	// Test the starting guess BEFORE drawing the shadow residual. With a vanishing
 	// r0 the product of rbar0 and rk stays below the 1e-8 threshold for every draw
 	// and the retry loop below would never terminate.
 	// The same measure as in the main loop is used, so the relative test can no
 	// longer end an abs_diff run early.
 	if (para.abs_diff) residual = sqrt(rk_square)/n_size;
 	else residual = rk_square/r0_square;
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, m, residual, &para, n_size, 0);
 		}
 		goto func_ends;
 	}
 	// Draw a random shadow residual rbar0 until its product with r0 is not negligible.
 	// NOTE(review): a non-converged but extremely small r0 could still keep this loop
 	// spinning; consider bounding the number of retries.
 	do
 	{
 		clcg_vecrnd(rbar0, lcg_complex(1.0, 0.0), lcg_complex(2.0, 0.0), n_size);
 		clcg_inner(rho, rbar0, rk, n_size);
 	} while (clcg_module(&rho) < 1e-8);
 	while(1)
 	{
 		Afp(instance, pk, vk, n_size, MatNormal, NonConjugate);
 		clcg_inner(sigma, rbar0, vk, n_size);
 		alpha = rho/sigma;
 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			qk[i] = uk[i] - alpha*vk[i];
 			uqk[i] = uk[i] + qk[i];
 		}
 		Afp(instance, uqk, Ax, n_size, MatNormal, NonConjugate);
 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			rk[i] = rk[i] - alpha*Ax[i];
 		}
 		clcg_inner(rk_mod2, rk, rk, n_size);
 		// two half-steps per outer pass: j == 1 uses uk, j == 2 uses qk
 		for (j = 1; j <= 2; j++)
 		{
 			// rk_square is refreshed only after both half-steps, so both see the same value
 			if (para.abs_diff) residual = sqrt(rk_square)/n_size;
 			else residual = rk_square/r0_square;
 			if (Pfp != nullptr)
 			{
 				if (Pfp(instance, m, residual, &para, n_size, t))
 				{
 					ret = CLCG_STOP; goto func_ends;
 				}
 			}
 			if (residual <= para.epsilon)
 			{
 				ret = CLCG_CONVERGENCE; goto func_ends;
 			}
 			if (para.max_iterations > 0 && t+1 > para.max_iterations)
 			{
 				// Leave the solver altogether. A plain 'break' would only exit this
 				// inner for-loop, and the outer while(1) would keep iterating past
 				// the limit without ever reporting it.
 				ret = LCG_REACHED_MAX_ITERATIONS;
 				goto func_ends;
 			}
 			t++;
 			sign = theta*theta*(eta/alpha);
 			if (j == 1)
 			{
 				omega = sqrt(clcg_module(&rk_mod)*clcg_module(&rk_mod2));
 #pragma omp parallel for private (i) schedule(guided)
 				for (i = 0; i < n_size; i++)
 				{
 					dk[i] = uk[i] + sign*dk[i];
 				}
 			}
 			else
 			{
 				omega = clcg_module(&rk_mod2);
 #pragma omp parallel for private (i) schedule(guided)
 				for (i = 0; i < n_size; i++)
 				{
 					dk[i] = qk[i] + sign*dk[i];
 				}
 			}
 			theta = omega/tao;
 			tao = omega/sqrt(1.0+theta*theta);
 			eta = (1.0/(1.0+theta*theta))*alpha;
 #pragma omp parallel for private (i) schedule(guided)
 			for (i = 0; i < n_size; i++)
 			{
 				m[i] = m[i] + eta*dk[i];
 			}
 			// a value that differs from itself is NaN
 			for (i = 0; i < n_size; i++)
 			{
 				if (m[i] != m[i])
 				{
 					ret = CLCG_NAN_VALUE; goto func_ends;
 				}
 			}
 		}
 		rk_mod = rk_mod2;
 		rk_square = clcg_square(&rk_mod);
 		clcg_inner(rho2, rbar0, rk, n_size);
 		betak = rho2/rho;
 		rho = rho2;
 #pragma omp parallel for private (i) schedule(guided)
 		for (i = 0; i < n_size; i++)
 		{
 			uk[i] = rk[i] + betak*qk[i];
 			pk[i] = uk[i] + betak*(qk[i] + betak*pk[i]);
 		}
 	}
 	func_ends:
 	{
 		clcg_free(pk);
 		clcg_free(uk);
 		clcg_free(vk);
 		clcg_free(dk);
 		clcg_free(rbar0);
 		clcg_free(rk);
 		clcg_free(Ax);
 		clcg_free(qk);
 		clcg_free(uqk);
 	}
 	return ret;
 }
--- a/src/lib/clcg.h
+++ b/src/lib/clcg.h
@@ -0,0 +1,78 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _CLCG_H
 #define _CLCG_H
 #include "lcg_complex.h"
 #include "util.h"
 /**
 * @brief  Callback interface for calculating the complex product of a N*N matrix 'A' multiplied 
 * by a complex vertical vector 'x'.
 * 
 * @param  instance    The user data sent for the clcg_solver() functions by the client.
 * @param  x           Multiplier of the Ax product (read only).
 * @param  prod_Ax     Output buffer receiving the product of A multiplied by x.
 * @param  x_size      Size of x and column/row numbers of A.
 * @param  layout      Whether to use the transpose of A for calculation.
 * @param  conjugate   Whether to use the conjugate of A for calculation.
 */
 typedef void (*clcg_axfunc_ptr)(void *instance, const lcg_complex *x, lcg_complex *prod_Ax, 
 	const int x_size, lcg_matrix_e layout, clcg_complex_e conjugate);
 /**
 * @brief     Callback interface for monitoring the progress and terminating the iteration 
 * if necessary.
 * 
 * @param    instance    The user data sent for the clcg_solver() functions by the client.
 * @param    m           The current solutions.
 * @param    converge    The current value evaluating the iteration progress.
 * @param    param       The parameter set used by the running solver.
 * @param    n_size      The size of the variables
 * @param    k           The iteration count.
 * 
 * @retval   int         Zero to continue the optimization process. Returning a
 *                       non-zero value will terminate the optimization process.
 */
 typedef int (*clcg_progress_ptr)(void* instance, const lcg_complex* m, 
 	const lcg_float converge, const clcg_para* param, const int n_size, const int k);
 /**
 * @brief      A combined complex conjugate gradient solver function.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param      B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the clcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_BICG.
 *
 * @return     Status of the function.
 */
 int clcg_solver(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, 
 	const lcg_complex* B, const int n_size, const clcg_para* param, void* instance, 
 	clcg_solver_enum solver_id = CLCG_BICG);
 #endif // _CLCG_H
--- a/src/lib/clcg_cuda.cu
+++ b/src/lib/clcg_cuda.cu
@@ -0,0 +1,529 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "cmath"
 #include "ctime"
 #include "iostream"
 #include "clcg_cuda.h"
 // Common signature shared by the un-preconditioned CUDA solvers in this file.
 // clcg_solver_cuda() stores the selected solver in a pointer of this type.
 typedef int (*cuda_solver_ptr)(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 // Forward declaration; selected by clcg_solver_cuda() for CLCG_BICG. Defined further down in this file.
 int clbicg(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 // Forward declaration; selected by clcg_solver_cuda() for CLCG_BICG_SYM. Defined further down in this file.
 int clbicg_symmetric(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 // Dispatch to the CUDA solver named by solver_id and hand back its status.
 // CLCG_BICG and CLCG_BICG_SYM are accepted; any other id yields CLCG_UNKNOWN_SOLVER
 // without calling a solver. All remaining arguments are passed through unchanged.
 int clcg_solver_cuda(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, const cuDoubleComplex* B, 
    const int n_size, const int nz_size, const clcg_para* param, void* instance, cublasHandle_t cub_handle, 
    cusparseHandle_t cus_handle, clcg_solver_enum solver_id)
 {
 	cuda_solver_ptr chosen = nullptr;
 	if (solver_id == CLCG_BICG)
 	{
 		chosen = clbicg;
 	}
 	else if (solver_id == CLCG_BICG_SYM)
 	{
 		chosen = clbicg_symmetric;
 	}
 	// no match above means the id is not handled here
 	if (chosen == nullptr)
 	{
 		return CLCG_UNKNOWN_SOLVER;
 	}
 	return chosen(Afp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
 }
 // Signature of the preconditioned CUDA solvers: same as the plain solvers plus the
 // extra callback Mfp. clcg_solver_preconditioned_cuda() stores the selected solver
 // in a pointer of this type.
 typedef int (*cuda_precondtioned_solver_ptr)(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, 
    cuDoubleComplex* m, const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, 
    void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 // Forward declaration; selected by clcg_solver_preconditioned_cuda() for CLCG_PCG. Defined further down in this file.
 int clpcg(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 // Dispatch to the preconditioned CUDA solver named by solver_id and hand back its status.
 // Only CLCG_PCG is accepted; any other id yields CLCG_UNKNOWN_SOLVER without calling
 // a solver. All remaining arguments are passed through unchanged.
 int clcg_solver_preconditioned_cuda(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, 
    cuDoubleComplex* m, const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, clcg_solver_enum solver_id)
 {
 	// reject everything but the single supported id up front
 	if (solver_id != CLCG_PCG)
 	{
 		return CLCG_UNKNOWN_SOLVER;
 	}
 	cuda_precondtioned_solver_ptr pcg_entry = clpcg;
 	return pcg_entry(Afp, Mfp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
 }
 /**
  * @brief      Bi-conjugate gradient solver for a complex system, running on the GPU.
  *
  * 'm' and 'B' are host arrays. They are copied into device buffers, the iteration 
  * runs on those buffers through cuBLAS calls and the Afp callback, and the latest 
  * iterate is copied back into 'm' before returning.
  * The residual measure is |r|/n_size when para.abs_diff is set, otherwise |r|^2 
  * divided by the initial |r|^2 (the divisor norm is clamped to at least 1.0).
  *
  * @param[in]  Afp         Callback computing the matrix-vector product on dense-vector descriptors. 
  * It is invoked with NON_TRANSPOSE and with CONJUGATE_TRANSPOSE.
  * @param[in]  Pfp         Optional progress callback (may be nullptr). It receives the DEVICE copy of 
  * the solution; a non-zero return stops the solver.
  * @param      m           Host array: initial solution on entry, latest iterate on return.
  * @param[in]  B           Host array holding the right-hand side.
  * @param[in]  n_size      Length of m and B. Must be positive.
  * @param[in]  nz_size     Forwarded untouched to Afp and Pfp.
  * @param[in]  param       Solver parameters. defparam2 is used when this is nullptr.
  * @param      instance    User data forwarded untouched to Afp and Pfp.
  * @param[in]  cub_handle  cuBLAS handle. Must not be null.
  * @param[in]  cus_handle  cuSPARSE handle. Must not be null.
  *
  * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED, LCG_REACHED_MAX_ITERATIONS, 
  * or a CLCG_INVILAD_* / CLCG_INVALID_POINTER code. CLCG_INVALID_POINTER is also returned 
  * when a device buffer cannot be allocated.
  */
 int clbicg(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// set BiCG parameters (library defaults when the caller passes none)
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	if (m == nullptr) return CLCG_INVALID_POINTER;
 	if (B == nullptr) return CLCG_INVALID_POINTER;
 	// same code family as the other pointer checks of this complex solver
 	if (cub_handle == nullptr) return CLCG_INVALID_POINTER;
 	if (cus_handle == nullptr) return CLCG_INVALID_POINTER;
 	const size_t vec_bytes = static_cast<size_t>(n_size) * sizeof(cuDoubleComplex);
 	cuDoubleComplex *d_m = nullptr, *d_B = nullptr;
 	cuDoubleComplex *r1k = nullptr, *r2k = nullptr;
 	cuDoubleComplex *d1k = nullptr, *d2k = nullptr, *Ax = nullptr;
 	// Stop at the first failed allocation; the remaining pointers stay null.
 	bool alloc_ok = cudaMalloc(&d_m, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&d_B, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&r1k, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&r2k, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&d1k, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&d2k, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&Ax, vec_bytes) == cudaSuccess;
 	if (!alloc_ok)
 	{
 		// cudaFree on a null pointer performs no operation, so every pointer can be passed.
 		cudaFree(d_m); cudaFree(d_B);
 		cudaFree(r1k); cudaFree(r2k);
 		cudaFree(d1k); cudaFree(d2k); cudaFree(Ax);
 		// NOTE(review): no dedicated allocation-failure code is visible here, so the
 		// missing device buffer is reported as an invalid pointer.
 		return CLCG_INVALID_POINTER;
 	}
 	// Copy initial solutions
 	cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
 	// dense-vector descriptors wrapping the device buffers handed to Afp
 	cusparseDnVecDescr_t dvec_m, dvec_d1k, dvec_d2k, dvec_Ax;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_64F);
 	cusparseCreateDnVec(&dvec_d1k, n_size, d1k, CUDA_C_64F);
 	cusparseCreateDnVec(&dvec_d2k, n_size, d2k, CUDA_C_64F);
 	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_64F);
 	cuDoubleComplex one, none;
 	one.x = 1.0; one.y = 0.0;
 	none.x = -1.0; none.y = 0.0;
 	// Every local is declared ahead of the first 'goto func_ends'.
 	cuDoubleComplex ak, nak, conj_ak, Ad1d2, r1r2, r1r2_next, betak, conj_betak;
 	lcg_float rk_mod, r0_mod, residual;
 	int ret, t = 0;
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 	// r0 = B - Ax
 	cudaMemcpy(r1k, d_B, vec_bytes, cudaMemcpyDeviceToDevice); // r0 = B
 	cublasZaxpy_v2(cub_handle, n_size, &none, Ax, 1, r1k, 1); // r0 -= Ax
 	cudaMemcpy(d1k, r1k, vec_bytes, cudaMemcpyDeviceToDevice); // d0 = r0
 	// second residual/direction pair; presumably r2k receives the conjugate of r1k
 	clcg_vecZ_conjugate(r1k, r2k, n_size);
 	cudaMemcpy(d2k, r2k, vec_bytes, cudaMemcpyDeviceToDevice);
 	cublasZdotc_v2(cub_handle, n_size, r2k, 1, r1k, 1, &r1r2);
 	cublasDznrm2_v2(cub_handle, n_size, r1k, 1, &rk_mod);
 	r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	// Test the starting guess with the same measure as the main loop, so the
 	// relative test can no longer end an abs_diff run early.
 	if (para.abs_diff) residual = rk_mod/n_size;
 	else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 		goto func_ends;
 	}
 	while(1)
 	{
 		if (para.abs_diff) residual = rk_mod/n_size;
 		else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}
 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}
 		// max_iterations == 0 means "no limit"
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		t++;
 		Afp(instance, cub_handle, cus_handle, dvec_d1k, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 		cublasZdotc_v2(cub_handle, n_size, d2k, 1, Ax, 1, &Ad1d2);
 		ak = cuCdiv(r1r2, Ad1d2);
 		nak = cuCmul(none, ak);
 		conj_ak = cuConj(nak);
 		cublasZaxpy_v2(cub_handle, n_size, &ak, d1k, 1, d_m, 1); // m += ak*d1
 		cublasZaxpy_v2(cub_handle, n_size, &nak, Ax, 1, r1k, 1); // r1 -= ak*A*d1
 		cublasDznrm2_v2(cub_handle, n_size, r1k, 1, &rk_mod);
 		// Ax is reused for the conjugate-transpose product with d2
 		Afp(instance, cub_handle, cus_handle, dvec_d2k, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE);
 		cublasZaxpy_v2(cub_handle, n_size, &conj_ak, Ax, 1, r2k, 1); // r2 -= conj(ak)*Ax
 		cublasZdotc_v2(cub_handle, n_size, r2k, 1, r1k, 1, &r1r2_next);
 		betak = cuCdiv(r1r2_next, r1r2);
 		conj_betak = cuConj(betak);
 		r1r2 = r1r2_next;
 		cublasZscal_v2(cub_handle, n_size, &betak, d1k, 1);
 		cublasZaxpy_v2(cub_handle, n_size, &one, r1k, 1, d1k, 1); // d1 = r1 + betak*d1
 		cublasZscal_v2(cub_handle, n_size, &conj_betak, d2k, 1);
 		cublasZaxpy_v2(cub_handle, n_size, &one, r2k, 1, d2k, 1); // d2 = r2 + conj(betak)*d2
 	}
 	func_ends:
 	{
 		// Copy to host memories
 		cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
 		cudaFree(d_m);
 		cudaFree(d_B);
 		cudaFree(r1k);
 		cudaFree(r2k);
 		cudaFree(d1k);
 		cudaFree(d2k);
 		cudaFree(Ax);
 		cusparseDestroyDnVec(dvec_m);
 		cusparseDestroyDnVec(dvec_d1k);
 		cusparseDestroyDnVec(dvec_d2k);
 		cusparseDestroyDnVec(dvec_Ax);
 	}
 	return ret;
 }
 /**
  * @brief      Bi-conjugate gradient solver variant that keeps a single residual/direction 
  * pair and uses unconjugated dot products, running on the GPU.
  *
  * 'm' and 'B' are host arrays. They are copied into device buffers, the iteration 
  * runs on those buffers through cuBLAS calls and the Afp callback, and the latest 
  * iterate is copied back into 'm' before returning.
  * The residual measure is |r|/n_size when para.abs_diff is set, otherwise |r|^2 
  * divided by the initial |r|^2 (the divisor norm is clamped to at least 1.0).
  *
  * @param[in]  Afp         Callback computing the matrix-vector product on dense-vector descriptors 
  * (always invoked with NON_TRANSPOSE).
  * @param[in]  Pfp         Optional progress callback (may be nullptr). It receives the DEVICE copy of 
  * the solution; a non-zero return stops the solver.
  * @param      m           Host array: initial solution on entry, latest iterate on return.
  * @param[in]  B           Host array holding the right-hand side.
  * @param[in]  n_size      Length of m and B. Must be positive.
  * @param[in]  nz_size     Forwarded untouched to Afp and Pfp.
  * @param[in]  param       Solver parameters. defparam2 is used when this is nullptr.
  * @param      instance    User data forwarded untouched to Afp and Pfp.
  * @param[in]  cub_handle  cuBLAS handle. Must not be null.
  * @param[in]  cus_handle  cuSPARSE handle. Must not be null.
  *
  * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED, LCG_REACHED_MAX_ITERATIONS, 
  * or a CLCG_INVILAD_* / CLCG_INVALID_POINTER code. CLCG_INVALID_POINTER is also returned 
  * when a device buffer cannot be allocated.
  */
 int clbicg_symmetric(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// set BiCG parameters (library defaults when the caller passes none)
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	if (m == nullptr) return CLCG_INVALID_POINTER;
 	if (B == nullptr) return CLCG_INVALID_POINTER;
 	// same code family as the other pointer checks of this complex solver
 	if (cub_handle == nullptr) return CLCG_INVALID_POINTER;
 	if (cus_handle == nullptr) return CLCG_INVALID_POINTER;
 	const size_t vec_bytes = static_cast<size_t>(n_size) * sizeof(cuDoubleComplex);
 	cuDoubleComplex *d_m = nullptr, *d_B = nullptr;
 	cuDoubleComplex *rk = nullptr, *dk = nullptr, *Ax = nullptr;
 	// Stop at the first failed allocation; the remaining pointers stay null.
 	bool alloc_ok = cudaMalloc(&d_m, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&d_B, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&rk, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&dk, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&Ax, vec_bytes) == cudaSuccess;
 	if (!alloc_ok)
 	{
 		// cudaFree on a null pointer performs no operation, so every pointer can be passed.
 		cudaFree(d_m); cudaFree(d_B);
 		cudaFree(rk); cudaFree(dk); cudaFree(Ax);
 		// NOTE(review): no dedicated allocation-failure code is visible here, so the
 		// missing device buffer is reported as an invalid pointer.
 		return CLCG_INVALID_POINTER;
 	}
 	// Copy initial solutions
 	cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
 	// dense-vector descriptors wrapping the device buffers handed to Afp
 	cusparseDnVecDescr_t dvec_m, dvec_dk, dvec_Ax;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_64F);
 	cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_C_64F);
 	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_64F);
 	cuDoubleComplex one, none;
 	one.x = 1.0; one.y = 0.0;
 	none.x = -1.0; none.y = 0.0;
 	// Every local is declared ahead of the first 'goto func_ends'.
 	cuDoubleComplex ak, nak, rkrk, rkrk2, betak, dkAx;
 	lcg_float rk_mod, r0_mod, residual;
 	int ret, t = 0;
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 	// r0 = B - Ax
 	cudaMemcpy(rk, d_B, vec_bytes, cudaMemcpyDeviceToDevice); // r0 = B
 	cublasZaxpy_v2(cub_handle, n_size, &none, Ax, 1, rk, 1); // r0 -= Ax
 	cudaMemcpy(dk, rk, vec_bytes, cudaMemcpyDeviceToDevice); // d0 = r0
 	// unconjugated product r.r drives the step sizes; the Euclidean norm drives the stop test
 	cublasZdotu_v2(cub_handle, n_size, rk, 1, rk, 1, &rkrk);
 	cublasDznrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
 	r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	// Test the starting guess with the same measure as the main loop, so the
 	// relative test can no longer end an abs_diff run early.
 	if (para.abs_diff) residual = rk_mod/n_size;
 	else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 		goto func_ends;
 	}
 	while(1)
 	{
 		if (para.abs_diff) residual = rk_mod/n_size;
 		else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}
 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}
 		// max_iterations == 0 means "no limit"
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		t++;
 		Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 		cublasZdotu_v2(cub_handle, n_size, dk, 1, Ax, 1, &dkAx);
 		ak = cuCdiv(rkrk, dkAx);
 		nak = cuCmul(none, ak);
 		cublasZaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1); // m += ak*d
 		cublasZaxpy_v2(cub_handle, n_size, &nak, Ax, 1, rk, 1); // r -= ak*A*d
 		cublasDznrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
 		cublasZdotu_v2(cub_handle, n_size, rk, 1, rk, 1, &rkrk2);
 		betak = cuCdiv(rkrk2, rkrk);
 		rkrk = rkrk2;
 		cublasZscal_v2(cub_handle, n_size, &betak, dk, 1);
 		cublasZaxpy_v2(cub_handle, n_size, &one, rk, 1, dk, 1); // d = r + betak*d
 	}
 	func_ends:
 	{
 		// Copy to host memories
 		cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
 		cudaFree(d_m);
 		cudaFree(d_B);
 		cudaFree(rk);
 		cudaFree(dk);
 		cudaFree(Ax);
 		cusparseDestroyDnVec(dvec_m);
 		cusparseDestroyDnVec(dvec_dk);
 		cusparseDestroyDnVec(dvec_Ax);
 	}
 	return ret;
 }
 /**
  * @brief      Preconditioned conjugate gradient solver for a complex system, running on the GPU.
  *
  * 'm' and 'B' are host arrays. They are copied into device buffers, the iteration 
  * runs on those buffers through cuBLAS calls and the Afp/Mfp callbacks, and the 
  * latest iterate is copied back into 'm' before returning.
  * The residual measure is |r|/n_size when para.abs_diff is set, otherwise |r|^2 
  * divided by the initial |r|^2 (the divisor norm is clamped to at least 1.0).
  *
  * @param[in]  Afp         Callback computing the matrix-vector product on dense-vector descriptors 
  * (always invoked with NON_TRANSPOSE).
  * @param[in]  Mfp         Preconditioner callback with the same signature as Afp. It is applied to 
  * the residual; its output seeds and updates the search direction.
  * @param[in]  Pfp         Optional progress callback (may be nullptr). It receives the DEVICE copy of 
  * the solution; a non-zero return stops the solver.
  * @param      m           Host array: initial solution on entry, latest iterate on return.
  * @param[in]  B           Host array holding the right-hand side.
  * @param[in]  n_size      Length of m and B. Must be positive.
  * @param[in]  nz_size     Forwarded untouched to Afp, Mfp and Pfp.
  * @param[in]  param       Solver parameters. defparam2 is used when this is nullptr.
  * @param      instance    User data forwarded untouched to Afp, Mfp and Pfp.
  * @param[in]  cub_handle  cuBLAS handle. Must not be null.
  * @param[in]  cus_handle  cuSPARSE handle. Must not be null.
  *
  * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED, LCG_REACHED_MAX_ITERATIONS, 
  * or a CLCG_INVILAD_* / CLCG_INVALID_POINTER code. CLCG_INVALID_POINTER is also returned 
  * when a device buffer cannot be allocated.
  */
 int clpcg(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// set PCG parameters (library defaults when the caller passes none)
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	if (m == nullptr) return CLCG_INVALID_POINTER;
 	if (B == nullptr) return CLCG_INVALID_POINTER;
 	// same code family as the other pointer checks of this complex solver
 	if (cub_handle == nullptr) return CLCG_INVALID_POINTER;
 	if (cus_handle == nullptr) return CLCG_INVALID_POINTER;
 	const size_t vec_bytes = static_cast<size_t>(n_size) * sizeof(cuDoubleComplex);
 	cuDoubleComplex *d_m = nullptr, *d_B = nullptr;
 	cuDoubleComplex *rk = nullptr, *dk = nullptr, *sk = nullptr, *Ax = nullptr;
 	// Stop at the first failed allocation; the remaining pointers stay null.
 	bool alloc_ok = cudaMalloc(&d_m, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&d_B, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&rk, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&dk, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&sk, vec_bytes) == cudaSuccess
 		&& cudaMalloc(&Ax, vec_bytes) == cudaSuccess;
 	if (!alloc_ok)
 	{
 		// cudaFree on a null pointer performs no operation, so every pointer can be passed.
 		cudaFree(d_m); cudaFree(d_B);
 		cudaFree(rk); cudaFree(dk);
 		cudaFree(sk); cudaFree(Ax);
 		// NOTE(review): no dedicated allocation-failure code is visible here, so the
 		// missing device buffer is reported as an invalid pointer.
 		return CLCG_INVALID_POINTER;
 	}
 	// Copy initial solutions
 	cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
 	// dense-vector descriptors wrapping the device buffers handed to Afp and Mfp
 	cusparseDnVecDescr_t dvec_m, dvec_rk, dvec_dk, dvec_sk, dvec_Ax;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_64F);
 	cusparseCreateDnVec(&dvec_rk, n_size, rk, CUDA_C_64F);
 	cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_C_64F);
 	cusparseCreateDnVec(&dvec_sk, n_size, sk, CUDA_C_64F);
 	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_64F);
 	cuDoubleComplex one, none;
 	one.x = 1.0; one.y = 0.0;
 	none.x = -1.0; none.y = 0.0;
 	// Every local is declared ahead of the first 'goto func_ends'.
 	cuDoubleComplex ak, nak, d_old, d_new, betak, dkAx;
 	lcg_float rk_mod, r0_mod, residual;
 	int ret, t = 0;
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 	// r0 = B - Ax
 	cudaMemcpy(rk, d_B, vec_bytes, cudaMemcpyDeviceToDevice); // r0 = B
 	cublasZaxpy_v2(cub_handle, n_size, &none, Ax, 1, rk, 1); // r0 -= Ax
 	// d0 = M(r0): the preconditioned residual is the first search direction
 	Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_dk, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 	cublasZdotu_v2(cub_handle, n_size, rk, 1, dk, 1, &d_new);
 	cublasDznrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
 	r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	// Test the starting guess with the same measure as the main loop, so the
 	// relative test can no longer end an abs_diff run early.
 	if (para.abs_diff) residual = rk_mod/n_size;
 	else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 		goto func_ends;
 	}
 	while(1)
 	{
 		if (para.abs_diff) residual = rk_mod/n_size;
 		else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}
 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}
 		// max_iterations == 0 means "no limit"
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		t++;
 		Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 		cublasZdotu_v2(cub_handle, n_size, dk, 1, Ax, 1, &dkAx);
 		ak = cuCdiv(d_new, dkAx);
 		nak = cuCmul(none, ak);
 		cublasZaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1); // m += ak*d
 		cublasZaxpy_v2(cub_handle, n_size, &nak, Ax, 1, rk, 1); // r -= ak*A*d
 		cublasDznrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
 		// s = M(r)
 		Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_sk, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 		d_old = d_new;
 		cublasZdotu_v2(cub_handle, n_size, rk, 1, sk, 1, &d_new);
 		betak = cuCdiv(d_new, d_old);
 		cublasZscal_v2(cub_handle, n_size, &betak, dk, 1);
 		cublasZaxpy_v2(cub_handle, n_size, &one, sk, 1, dk, 1); // d = s + betak*d
 	}
 	func_ends:
 	{
 		// Copy to host memories
 		cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
 		cudaFree(d_m);
 		cudaFree(d_B);
 		cudaFree(rk);
 		cudaFree(dk);
 		cudaFree(sk);
 		cudaFree(Ax);
 		cusparseDestroyDnVec(dvec_m);
 		cusparseDestroyDnVec(dvec_rk);
 		cusparseDestroyDnVec(dvec_dk);
 		cusparseDestroyDnVec(dvec_sk);
 		cusparseDestroyDnVec(dvec_Ax);
 	}
 	return ret;
 }
--- a/src/lib/clcg_cuda.h
+++ b/src/lib/clcg_cuda.h
@@ -0,0 +1,109 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _CLCG_CUDA_H
 #define _CLCG_CUDA_H
 #include "util.h"
 #include "lcg_complex_cuda.h"
 #ifdef LibLCG_CUDA
 #include <cublas_v2.h>
 #include <cusparse_v2.h>
 /**
 * @brief  Callback interface for calculating the product of an N*N matrix 'A' multiplied 
 * by a column vector 'x'. Note that both A and x are hosted on the GPU device; the vectors 
 * are handed over as cuSPARSE dense-vector descriptors.
 * 
 * @param  instance    The user data sent to the clcg_solver_cuda() functions by the client.
 * @param  cub_handle  Handle of the cuBLAS object.
 * @param  cus_handle  Handle of the cuSPARSE object.
 * @param  x           Descriptor of the multiplier of the Ax product.
 * @param  prod_Ax     Descriptor of the vector receiving the product of A multiplied by x.
 * @param  n_size      Size of x and column/row numbers of A.
 * @param  nz_size     Presumably the number of non-zero entries of A -- TODO confirm.
 * @param  oper_t      cuSPARSE operation flag selecting which form of A is applied 
 * (e.g. CUSPARSE_OPERATION_NON_TRANSPOSE).
 */
 typedef void (*clcg_axfunc_cuda_ptr)(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, cusparseOperation_t oper_t);
 /**
 * @brief     Callback interface for monitoring the progress and terminating the iteration 
 * if necessary. Note that m is hosted on the GPU device.
 * 
 * @param    instance    The user data sent to the clcg_solver_cuda() functions by the client.
 * @param    m           The current solution.
 * @param    converge    The current value evaluating the iteration progress.
 * @param    param       The parameter set used by the running solver.
 * @param    n_size      The size of the variables.
 * @param    nz_size     Presumably the number of non-zero entries of the system matrix -- TODO confirm.
 * @param    k           The iteration count.
 * 
 * @retval   int         Zero to continue the optimization process. Returning a
 *                       non-zero value will terminate the optimization process.
 */
 typedef int (*clcg_progress_cuda_ptr)(void* instance, const cuDoubleComplex* m, const lcg_float converge, 
 	const clcg_para* param, const int n_size, const int nz_size, const int k);
 /**
 * @brief      A combined conjugate gradient solver function (double-precision complex). 
 * Note that both m and B are hosted on the GPU device.
 * NOTE(review): a double-precision solver in this library copies its result back into m with 
 * cudaMemcpyDeviceToHost, which would make m a host buffer despite the statement above -- confirm.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector; receives the solution on return.
 * @param      B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  nz_size     Forwarded to the callbacks; presumably the number of non-zero 
 * entries of the system matrix -- TODO confirm.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent to the callbacks by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      cub_handle  Handle of the cuBLAS object.
 * @param      cus_handle  Handle of the cuSPARSE object.
 * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_BICG.
 *
 * @return     Status of the function.
 */
 int clcg_solver_cuda(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, const cuDoubleComplex* B, 
    const int n_size, const int nz_size, const clcg_para* param, void* instance, cublasHandle_t cub_handle, 
    cusparseHandle_t cus_handle, clcg_solver_enum solver_id = CLCG_BICG);
 /**
 * @brief      A combined preconditioned conjugate gradient solver function (double-precision complex). 
 * Note that both m and B are hosted on the GPU device.
 * NOTE(review): a double-precision solver in this library copies its result back into m with 
 * cudaMemcpyDeviceToHost, which would make m a host buffer despite the statement above -- confirm.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Mfp         Callback function for calculating the product of 'Mx' for preconditioning.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector; receives the solution on return.
 * @param      B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  nz_size     Forwarded to the callbacks; presumably the number of non-zero 
 * entries of the system matrix -- TODO confirm.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent to the callbacks by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      cub_handle  Handle of the cuBLAS object.
 * @param      cus_handle  Handle of the cuSPARSE object.
 * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_PCG.
 *
 * @return     Status of the function.
 */
 int clcg_solver_preconditioned_cuda(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, 
    cuDoubleComplex* m, const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, clcg_solver_enum solver_id = CLCG_PCG);
 #endif // LibLCG_CUDA
 #endif // _CLCG_CUDA_H
--- a/src/lib/clcg_cudaf.cu
+++ b/src/lib/clcg_cudaf.cu
@@ -0,0 +1,529 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "cmath"
 #include "ctime"
 #include "iostream"
 #include "clcg_cudaf.h"
 typedef int (*cuda_solver_ptr)(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 int clbicg(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 int clbicg_symmetric(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 /**
 * @brief      Entry point of the non-preconditioned single-precision complex solvers.
 *
 * Maps solver_id onto one of the solvers of this file (CLCG_BICG -> clbicg, 
 * CLCG_BICG_SYM -> clbicg_symmetric) and forwards every other argument unchanged.
 *
 * @return     The status returned by the selected solver, or CLCG_UNKNOWN_SOLVER when 
 * solver_id matches none of the ids above.
 */
 int clcg_solver_cuda(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, const cuComplex* B, 
    const int n_size, const int nz_size, const clcg_para* param, void* instance, cublasHandle_t cub_handle, 
    cusparseHandle_t cus_handle, clcg_solver_enum solver_id)
 {
 	// Stays null for ids this file has no solver for
 	cuda_solver_ptr chosen = nullptr;
 	if (solver_id == CLCG_BICG)
 	{
 		chosen = clbicg;
 	}
 	else if (solver_id == CLCG_BICG_SYM)
 	{
 		chosen = clbicg_symmetric;
 	}
 	if (chosen == nullptr)
 	{
 		return CLCG_UNKNOWN_SOLVER;
 	}
 	return chosen(Afp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
 }
 typedef int (*cuda_precondtioned_solver_ptr)(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp, clcg_progress_cudaf_ptr Pfp, 
    cuComplex* m, const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, 
    void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 int clpcg(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 /**
 * @brief      Entry point of the preconditioned single-precision complex solvers.
 *
 * Only CLCG_PCG is handled, in which case every other argument is forwarded unchanged to clpcg.
 *
 * @return     The status returned by clpcg, or CLCG_UNKNOWN_SOLVER for any other solver_id.
 */
 int clcg_solver_preconditioned_cuda(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp, clcg_progress_cudaf_ptr Pfp, 
    cuComplex* m, const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, clcg_solver_enum solver_id)
 {
 	// Guard clause: bail out on every id except the single supported one
 	if (solver_id != CLCG_PCG)
 	{
 		return CLCG_UNKNOWN_SOLVER;
 	}
 	cuda_precondtioned_solver_ptr chosen = clpcg;
 	return chosen(Afp, Mfp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
 }
 /**
 * @brief      Bi-conjugate gradient solver for single-precision complex linear systems.
 *
 * Device work buffers are allocated for the solution, the objective vector, two residuals 
 * (r1k, r2k), two search directions (d1k, d2k) and the product Ax. m and B are copied in with 
 * cudaMemcpyHostToDevice and the solution is copied back into m with cudaMemcpyDeviceToHost 
 * before returning, i.e. both are treated as host buffers here.
 * NOTE(review): clcg_cudaf.h describes m and B as device-resident -- confirm which is intended.
 *
 * @param[in]  Afp         Callback computing the product of A with a vector; it is also called 
 * with CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE. Must not be null.
 * @param[in]  Pfp         Optional progress callback; a non-zero return stops the iteration.
 * @param      m           Initial solution on input, final solution on output.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  n_size      Length of m and B.
 * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
 * @param[in]  param       Solver parameters; defparam2 is used when null.
 * @param      instance    User data forwarded to the callbacks.
 * @param[in]  cub_handle  cuBLAS handle. Must not be null.
 * @param[in]  cus_handle  cuSPARSE handle. Must not be null.
 *
 * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED or LCG_REACHED_MAX_ITERATIONS 
 * once the solver has run; CLCG_INVILAD_VARIABLE_SIZE, CLCG_INVILAD_MAX_ITERATIONS, 
 * CLCG_INVILAD_EPSILON or CLCG_INVALID_POINTER when the arguments are rejected.
 */
 int clbicg(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// Use the caller's parameters when given, the library defaults otherwise
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	// Reject invalid arguments before anything is allocated on the device
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	if (m == nullptr || B == nullptr) return CLCG_INVALID_POINTER;
 	// Afp is called unconditionally below, so a null callback must be refused here
 	if (Afp == nullptr) return CLCG_INVALID_POINTER;
 	// Null handles are reported with the same complex-solver code as the other pointers
 	if (cub_handle == nullptr || cus_handle == nullptr) return CLCG_INVALID_POINTER;
 	const size_t vec_bytes = n_size * sizeof(cuComplex);
 	cuComplex *d_m = nullptr, *d_B = nullptr;
 	cuComplex *r1k = nullptr, *r2k = nullptr;
 	cuComplex *d1k = nullptr, *d2k = nullptr, *Ax = nullptr;
 	cudaMalloc(&d_m, vec_bytes);
 	cudaMalloc(&d_B, vec_bytes);
 	cudaMalloc(&r1k, vec_bytes);
 	cudaMalloc(&r2k, vec_bytes);
 	cudaMalloc(&d1k, vec_bytes);
 	cudaMalloc(&d2k, vec_bytes);
 	cudaMalloc(&Ax, vec_bytes);
 	// Copy the initial solution and the objective vector
 	cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
 	// Descriptors for the vectors that are handed to the Afp callback
 	cusparseDnVecDescr_t dvec_m, dvec_d1k, dvec_d2k, dvec_Ax;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_32F);
 	cusparseCreateDnVec(&dvec_d1k, n_size, d1k, CUDA_C_32F);
 	cusparseCreateDnVec(&dvec_d2k, n_size, d2k, CUDA_C_32F);
 	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_32F);
 	cuComplex one, none;
 	one.x = 1.0f; one.y = 0.0f;
 	none.x = -1.0f; none.y = 0.0f;
 	cuComplex ak, nak, conj_nak, Ad1d2, r1r2, r1r2_next, betak, conj_betak;
 	// Declared without initialisers so that the gotos below do not bypass an initialisation
 	float rk_mod, r0_mod, residual;
 	int ret, t = 0;
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 	// r0 = B - Ax
 	cudaMemcpy(r1k, d_B, vec_bytes, cudaMemcpyDeviceToDevice); // r0 = B
 	cublasCaxpy_v2(cub_handle, n_size, &none, Ax, 1, r1k, 1); // r0 -= Ax
 	cudaMemcpy(d1k, r1k, vec_bytes, cudaMemcpyDeviceToDevice); // d0 = r0
 	// Second residual/direction pair; presumably r2k receives the element-wise conjugate of r1k -- TODO confirm
 	clcg_vecC_conjugate(r1k, r2k, n_size);
 	cudaMemcpy(d2k, r2k, vec_bytes, cudaMemcpyDeviceToDevice);
 	cublasCdotc_v2(cub_handle, n_size, r2k, 1, r1k, 1, &r1r2);
 	cublasScnrm2_v2(cub_handle, n_size, r1k, 1, &rk_mod);
 	// The relative criterion is normalised by max(|r0|, 1)
 	r0_mod = rk_mod;
 	if (r0_mod < 1.0f) r0_mod = 1.0f;
 	// Test the starting model against the criterion selected by abs_diff only. The previous 
 	// code fell through to the relative test when the absolute one failed, so it could report 
 	// an already optimized model that the loop below would not have accepted.
 	if (para.abs_diff) residual = rk_mod/n_size;
 	else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 		goto func_ends;
 	}
 	while(1)
 	{
 		if (para.abs_diff) residual = rk_mod/n_size;
 		else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}
 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}
 		// max_iterations == 0 means no iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		t++;
 		// ak = (r2, r1) / (d2, A*d1)
 		Afp(instance, cub_handle, cus_handle, dvec_d1k, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 		cublasCdotc_v2(cub_handle, n_size, d2k, 1, Ax, 1, &Ad1d2);
 		ak = cuCdivf(r1r2, Ad1d2);
 		nak = cuCmulf(none, ak);
 		conj_nak = cuConjf(nak);
 		// m += ak*d1, r1 -= ak*A*d1
 		cublasCaxpy_v2(cub_handle, n_size, &ak, d1k, 1, d_m, 1);
 		cublasCaxpy_v2(cub_handle, n_size, &nak, Ax, 1, r1k, 1);
 		cublasScnrm2_v2(cub_handle, n_size, r1k, 1, &rk_mod);
 		// r2 -= conj(ak)*A^H*d2
 		Afp(instance, cub_handle, cus_handle, dvec_d2k, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE);
 		cublasCaxpy_v2(cub_handle, n_size, &conj_nak, Ax, 1, r2k, 1);
 		cublasCdotc_v2(cub_handle, n_size, r2k, 1, r1k, 1, &r1r2_next);
 		betak = cuCdivf(r1r2_next, r1r2);
 		conj_betak = cuConjf(betak);
 		r1r2 = r1r2_next;
 		// d1 = r1 + betak*d1, d2 = r2 + conj(betak)*d2
 		cublasCscal_v2(cub_handle, n_size, &betak, d1k, 1);
 		cublasCaxpy_v2(cub_handle, n_size, &one, r1k, 1, d1k, 1);
 		cublasCscal_v2(cub_handle, n_size, &conj_betak, d2k, 1);
 		cublasCaxpy_v2(cub_handle, n_size, &one, r2k, 1, d2k, 1);
 	}
 	func_ends:
 	{
 		// Copy the solution back and release every device resource
 		cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
 		cudaFree(d_m);
 		cudaFree(d_B);
 		cudaFree(r1k);
 		cudaFree(r2k);
 		cudaFree(d1k);
 		cudaFree(d2k);
 		cudaFree(Ax);
 		cusparseDestroyDnVec(dvec_m);
 		cusparseDestroyDnVec(dvec_d1k);
 		cusparseDestroyDnVec(dvec_d2k);
 		cusparseDestroyDnVec(dvec_Ax);
 	}
 	return ret;
 }
 /**
 * @brief      Bi-conjugate gradient solver for single-precision complex systems that keeps a 
 * single residual/direction pair and uses unconjugated dot products (cublasCdotu).
 *
 * m and B are copied in with cudaMemcpyHostToDevice and the solution is copied back into m 
 * with cudaMemcpyDeviceToHost before returning, i.e. both are treated as host buffers here.
 * NOTE(review): clcg_cudaf.h describes m and B as device-resident -- confirm which is intended.
 *
 * @param[in]  Afp         Callback computing the product of A with a vector. Must not be null.
 * @param[in]  Pfp         Optional progress callback; a non-zero return stops the iteration.
 * @param      m           Initial solution on input, final solution on output.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  n_size      Length of m and B.
 * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
 * @param[in]  param       Solver parameters; defparam2 is used when null.
 * @param      instance    User data forwarded to the callbacks.
 * @param[in]  cub_handle  cuBLAS handle. Must not be null.
 * @param[in]  cus_handle  cuSPARSE handle. Must not be null.
 *
 * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED or LCG_REACHED_MAX_ITERATIONS 
 * once the solver has run; CLCG_INVILAD_VARIABLE_SIZE, CLCG_INVILAD_MAX_ITERATIONS, 
 * CLCG_INVILAD_EPSILON or CLCG_INVALID_POINTER when the arguments are rejected.
 */
 int clbicg_symmetric(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// Use the caller's parameters when given, the library defaults otherwise
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	// Reject invalid arguments before anything is allocated on the device
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	if (m == nullptr || B == nullptr) return CLCG_INVALID_POINTER;
 	// Afp is called unconditionally below, so a null callback must be refused here
 	if (Afp == nullptr) return CLCG_INVALID_POINTER;
 	// Null handles are reported with the same complex-solver code as the other pointers
 	if (cub_handle == nullptr || cus_handle == nullptr) return CLCG_INVALID_POINTER;
 	// Every buffer holds n_size cuComplex values; d_m and d_B used to be sized with 
 	// sizeof(cuDoubleComplex), which allocated twice the memory they need
 	const size_t vec_bytes = n_size * sizeof(cuComplex);
 	cuComplex *d_m = nullptr, *d_B = nullptr;
 	cuComplex *rk = nullptr, *dk = nullptr, *Ax = nullptr;
 	cudaMalloc(&d_m, vec_bytes);
 	cudaMalloc(&d_B, vec_bytes);
 	cudaMalloc(&rk, vec_bytes);
 	cudaMalloc(&dk, vec_bytes);
 	cudaMalloc(&Ax, vec_bytes);
 	// Copy the initial solution and the objective vector
 	cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
 	// Descriptors for the vectors that are handed to the Afp callback
 	cusparseDnVecDescr_t dvec_m, dvec_dk, dvec_Ax;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_32F);
 	cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_C_32F);
 	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_32F);
 	cuComplex one, none;
 	one.x = 1.0f; one.y = 0.0f;
 	none.x = -1.0f; none.y = 0.0f;
 	cuComplex ak, nak, rkrk, rkrk2, betak, dkAx;
 	// Declared without initialisers so that the gotos below do not bypass an initialisation
 	float rk_mod, r0_mod, residual;
 	int ret, t = 0;
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 	// r0 = B - Ax
 	cudaMemcpy(rk, d_B, vec_bytes, cudaMemcpyDeviceToDevice); // r0 = B
 	cublasCaxpy_v2(cub_handle, n_size, &none, Ax, 1, rk, 1); // r0 -= Ax
 	cudaMemcpy(dk, rk, vec_bytes, cudaMemcpyDeviceToDevice); // d0 = r0
 	cublasCdotu_v2(cub_handle, n_size, rk, 1, rk, 1, &rkrk);
 	cublasScnrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
 	// The relative criterion is normalised by max(|r0|, 1)
 	r0_mod = rk_mod;
 	if (r0_mod < 1.0f) r0_mod = 1.0f;
 	// Test the starting model against the criterion selected by abs_diff only. The previous 
 	// code fell through to the relative test when the absolute one failed, so it could report 
 	// an already optimized model that the loop below would not have accepted.
 	if (para.abs_diff) residual = rk_mod/n_size;
 	else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 		goto func_ends;
 	}
 	while(1)
 	{
 		if (para.abs_diff) residual = rk_mod/n_size;
 		else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}
 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}
 		// max_iterations == 0 means no iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		t++;
 		// ak = (r, r) / (d, A*d), both dot products unconjugated
 		Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 		cublasCdotu_v2(cub_handle, n_size, dk, 1, Ax, 1, &dkAx);
 		ak = cuCdivf(rkrk, dkAx);
 		nak = cuCmulf(none, ak);
 		// m += ak*d, r -= ak*A*d
 		cublasCaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1);
 		cublasCaxpy_v2(cub_handle, n_size, &nak, Ax, 1, rk, 1);
 		cublasScnrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
 		cublasCdotu_v2(cub_handle, n_size, rk, 1, rk, 1, &rkrk2);
 		betak = cuCdivf(rkrk2, rkrk);
 		rkrk = rkrk2;
 		// d = r + betak*d
 		cublasCscal_v2(cub_handle, n_size, &betak, dk, 1);
 		cublasCaxpy_v2(cub_handle, n_size, &one, rk, 1, dk, 1);
 	}
 	func_ends:
 	{
 		// Copy the solution back and release every device resource
 		cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
 		cudaFree(d_m);
 		cudaFree(d_B);
 		cudaFree(rk);
 		cudaFree(dk);
 		cudaFree(Ax);
 		cusparseDestroyDnVec(dvec_m);
 		cusparseDestroyDnVec(dvec_dk);
 		cusparseDestroyDnVec(dvec_Ax);
 	}
 	return ret;
 }
 /**
 * @brief      Preconditioned conjugate gradient solver for single-precision complex systems, 
 * using unconjugated dot products (cublasCdotu).
 *
 * m and B are copied in with cudaMemcpyHostToDevice and the solution is copied back into m 
 * with cudaMemcpyDeviceToHost before returning, i.e. both are treated as host buffers here.
 * NOTE(review): clcg_cudaf.h describes m and B as device-resident -- confirm which is intended.
 *
 * @param[in]  Afp         Callback computing the product of A with a vector. Must not be null.
 * @param[in]  Mfp         Callback applying the preconditioner to a vector. Must not be null.
 * @param[in]  Pfp         Optional progress callback; a non-zero return stops the iteration.
 * @param      m           Initial solution on input, final solution on output.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  n_size      Length of m and B.
 * @param[in]  nz_size     Forwarded unchanged to Afp, Mfp and Pfp.
 * @param[in]  param       Solver parameters; defparam2 is used when null.
 * @param      instance    User data forwarded to the callbacks.
 * @param[in]  cub_handle  cuBLAS handle. Must not be null.
 * @param[in]  cus_handle  cuSPARSE handle. Must not be null.
 *
 * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED or LCG_REACHED_MAX_ITERATIONS 
 * once the solver has run; CLCG_INVILAD_VARIABLE_SIZE, CLCG_INVILAD_MAX_ITERATIONS, 
 * CLCG_INVILAD_EPSILON or CLCG_INVALID_POINTER when the arguments are rejected.
 */
 int clpcg(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// Use the caller's parameters when given, the library defaults otherwise
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	// Reject invalid arguments before anything is allocated on the device
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	if (m == nullptr || B == nullptr) return CLCG_INVALID_POINTER;
 	// Both callbacks are called unconditionally below, so null ones must be refused here
 	if (Afp == nullptr || Mfp == nullptr) return CLCG_INVALID_POINTER;
 	// Null handles are reported with the same complex-solver code as the other pointers
 	if (cub_handle == nullptr || cus_handle == nullptr) return CLCG_INVALID_POINTER;
 	const size_t vec_bytes = n_size * sizeof(cuComplex);
 	cuComplex *d_m = nullptr, *d_B = nullptr;
 	cuComplex *rk = nullptr, *dk = nullptr, *sk = nullptr, *Ax = nullptr;
 	cudaMalloc(&d_m, vec_bytes);
 	cudaMalloc(&d_B, vec_bytes);
 	cudaMalloc(&rk, vec_bytes);
 	cudaMalloc(&dk, vec_bytes);
 	cudaMalloc(&sk, vec_bytes);
 	cudaMalloc(&Ax, vec_bytes);
 	// Copy the initial solution and the objective vector
 	cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
 	// Descriptors for the vectors that are handed to the Afp and Mfp callbacks
 	cusparseDnVecDescr_t dvec_m, dvec_rk, dvec_dk, dvec_sk, dvec_Ax;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_32F);
 	cusparseCreateDnVec(&dvec_rk, n_size, rk, CUDA_C_32F);
 	cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_C_32F);
 	cusparseCreateDnVec(&dvec_sk, n_size, sk, CUDA_C_32F);
 	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_32F);
 	cuComplex one, none;
 	one.x = 1.0f; one.y = 0.0f;
 	none.x = -1.0f; none.y = 0.0f;
 	cuComplex ak, nak, d_old, d_new, betak, dkAx;
 	// Declared without initialisers so that the gotos below do not bypass an initialisation
 	float rk_mod, r0_mod, residual;
 	int ret, t = 0;
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 	// r0 = B - Ax
 	cudaMemcpy(rk, d_B, vec_bytes, cudaMemcpyDeviceToDevice); // r0 = B
 	cublasCaxpy_v2(cub_handle, n_size, &none, Ax, 1, rk, 1); // r0 -= Ax
 	// d0 = M*r0
 	Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_dk, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 	cublasCdotu_v2(cub_handle, n_size, rk, 1, dk, 1, &d_new);
 	cublasScnrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
 	// The relative criterion is normalised by max(|r0|, 1)
 	r0_mod = rk_mod;
 	if (r0_mod < 1.0f) r0_mod = 1.0f;
 	// Test the starting model against the criterion selected by abs_diff only. The previous 
 	// code fell through to the relative test when the absolute one failed, so it could report 
 	// an already optimized model that the loop below would not have accepted.
 	if (para.abs_diff) residual = rk_mod/n_size;
 	else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 		goto func_ends;
 	}
 	while(1)
 	{
 		if (para.abs_diff) residual = rk_mod/n_size;
 		else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
 		if (Pfp != nullptr)
 		{
 			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = CLCG_STOP; goto func_ends;
 			}
 		}
 		if (residual <= para.epsilon)
 		{
 			ret = CLCG_CONVERGENCE; goto func_ends;
 		}
 		// max_iterations == 0 means no iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			ret = LCG_REACHED_MAX_ITERATIONS;
 			break;
 		}
 		t++;
 		// ak = (r, M*r) / (d, A*d), both dot products unconjugated
 		Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 		cublasCdotu_v2(cub_handle, n_size, dk, 1, Ax, 1, &dkAx);
 		ak = cuCdivf(d_new, dkAx);
 		nak = cuCmulf(none, ak);
 		// m += ak*d, r -= ak*A*d
 		cublasCaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1);
 		cublasCaxpy_v2(cub_handle, n_size, &nak, Ax, 1, rk, 1);
 		cublasScnrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
 		// s = M*r
 		Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_sk, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
 		d_old = d_new;
 		cublasCdotu_v2(cub_handle, n_size, rk, 1, sk, 1, &d_new);
 		betak = cuCdivf(d_new, d_old);
 		// d = s + betak*d
 		cublasCscal_v2(cub_handle, n_size, &betak, dk, 1);
 		cublasCaxpy_v2(cub_handle, n_size, &one, sk, 1, dk, 1);
 	}
 	func_ends:
 	{
 		// Copy the solution back and release every device resource
 		cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
 		cudaFree(d_m);
 		cudaFree(d_B);
 		cudaFree(rk);
 		cudaFree(dk);
 		cudaFree(sk);
 		cudaFree(Ax);
 		cusparseDestroyDnVec(dvec_m);
 		cusparseDestroyDnVec(dvec_rk);
 		cusparseDestroyDnVec(dvec_dk);
 		cusparseDestroyDnVec(dvec_sk);
 		cusparseDestroyDnVec(dvec_Ax);
 	}
 	return ret;
 }
--- a/src/lib/clcg_cudaf.h
+++ b/src/lib/clcg_cudaf.h
@@ -0,0 +1,109 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _CLCG_CUDA_FLOAT_H
 #define _CLCG_CUDA_FLOAT_H
 #include "util.h"
 #include "lcg_complex_cuda.h"
 #ifdef LibLCG_CUDA
 #include <cublas_v2.h>
 #include <cusparse_v2.h>
 /**
 * @brief  Callback interface for calculating the product of an N*N matrix 'A' multiplied 
 * by a column vector 'x'. Note that both A and x are hosted on the GPU device; the vectors 
 * are handed over as cuSPARSE dense-vector descriptors.
 * 
 * @param  instance    The user data sent to the clcg_solver_cuda() functions by the client.
 * @param  cub_handle  Handle of the cuBLAS object.
 * @param  cus_handle  Handle of the cuSPARSE object.
 * @param  x           Descriptor of the multiplier of the Ax product.
 * @param  prod_Ax     Descriptor of the vector receiving the product of A multiplied by x.
 * @param  n_size      Size of x and column/row numbers of A.
 * @param  nz_size     Presumably the number of non-zero entries of A -- TODO confirm.
 * @param  oper_t      cuSPARSE operation flag selecting which form of A is applied; the solvers 
 * pass CUSPARSE_OPERATION_NON_TRANSPOSE or CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE.
 */
 typedef void (*clcg_axfunc_cudaf_ptr)(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, cusparseOperation_t oper_t);
 /**
 * @brief     Callback interface for monitoring the progress and terminating the iteration 
 * if necessary. Note that m is hosted on the GPU device.
 * 
 * @param    instance    The user data sent to the clcg_solver_cuda() functions by the client.
 * @param    m           The current solution.
 * @param    converge    The current value evaluating the iteration progress.
 * @param    param       The parameter set used by the running solver.
 * @param    n_size      The size of the variables.
 * @param    nz_size     Presumably the number of non-zero entries of the system matrix -- TODO confirm.
 * @param    k           The iteration count.
 * 
 * @retval   int         Zero to continue the optimization process. Returning a
 *                       non-zero value will terminate the optimization process.
 */
 typedef int (*clcg_progress_cudaf_ptr)(void* instance, const cuComplex* m, const float converge, 
 	const clcg_para* param, const int n_size, const int nz_size, const int k);
 /**
 * @brief      A combined conjugate gradient solver function (single-precision complex). 
 * Note that both m and B are hosted on the GPU device.
 * NOTE(review): the solvers in clcg_cudaf.cu copy m and B with cudaMemcpyHostToDevice and write 
 * the result back with cudaMemcpyDeviceToHost, i.e. they treat both as host buffers -- confirm.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector; receives the solution on return.
 * @param      B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  nz_size     Forwarded to the callbacks; presumably the number of non-zero 
 * entries of the system matrix -- TODO confirm.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent to the callbacks by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      cub_handle  Handle of the cuBLAS object.
 * @param      cus_handle  Handle of the cuSPARSE object.
 * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_BICG.
 *
 * @return     Status of the function.
 */
 int clcg_solver_cuda(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, const cuComplex* B, 
    const int n_size, const int nz_size, const clcg_para* param, void* instance, cublasHandle_t cub_handle, 
    cusparseHandle_t cus_handle, clcg_solver_enum solver_id = CLCG_BICG);
 /**
 * @brief      A combined preconditioned conjugate gradient solver function (single-precision complex). 
 * Note that both m and B are hosted on the GPU device.
 * NOTE(review): the solvers in clcg_cudaf.cu copy m and B with cudaMemcpyHostToDevice and write 
 * the result back with cudaMemcpyDeviceToHost, i.e. they treat both as host buffers -- confirm.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Mfp         Callback function for calculating the product of 'Mx' for preconditioning.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector; receives the solution on return.
 * @param      B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  nz_size     Forwarded to the callbacks; presumably the number of non-zero 
 * entries of the system matrix -- TODO confirm.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent to the callbacks by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      cub_handle  Handle of the cuBLAS object.
 * @param      cus_handle  Handle of the cuSPARSE object.
 * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_PCG.
 *
 * @return     Status of the function.
 */
 int clcg_solver_preconditioned_cuda(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp, clcg_progress_cudaf_ptr Pfp, 
    cuComplex* m, const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, clcg_solver_enum solver_id = CLCG_PCG);
 #endif // LibLCG_CUDA
 #endif // _CLCG_CUDA_FLOAT_H
--- a/src/lib/clcg_eigen.cpp
+++ b/src/lib/clcg_eigen.cpp
@@ -0,0 +1,777 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "cmath"
 #include "ctime"
 #include "iostream"
 #include "clcg_eigen.h"
 #include "config.h"
 #ifdef LibLCG_OPENMP
 #include "omp.h"
 #endif
 // Common signature of the non-preconditioned solvers declared below; clcg_solver_eigen()
 // stores the selected solver in a pointer of this type before calling it.
 typedef int (*eigen_solver_ptr)(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
 // Forward declarations of the individual solvers (defined later in this file).
 // Selected by CLCG_BICG.
 int clbicg(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
 // Selected by CLCG_BICG_SYM.
 int clbicg_symmetric(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
 // Selected by CLCG_CGS.
 int clcgs(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
 // Selected by CLCG_TFQMR.
 int cltfqmr(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
 /**
 * @brief      Entry point of the non-preconditioned complex solvers working on Eigen vectors.
 *
 * Forwards all arguments unchanged to the solver matching solver_id and hands back its status.
 *
 * @return     Status of the selected solver, or CLCG_UNKNOWN_SOLVER when solver_id is not one of
 * CLCG_BICG, CLCG_BICG_SYM, CLCG_CGS or CLCG_TFQMR.
 */
 int clcg_solver_eigen(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance, clcg_solver_enum solver_id)
 {
 	// Every recognised id returns straight from its case; anything else leaves the switch.
 	switch (solver_id)
 	{
 		case CLCG_BICG:
 			return clbicg(Afp, Pfp, m, B, param, instance);
 		case CLCG_BICG_SYM:
 			return clbicg_symmetric(Afp, Pfp, m, B, param, instance);
 		case CLCG_CGS:
 			return clcgs(Afp, Pfp, m, B, param, instance);
 		case CLCG_TFQMR:
 			return cltfqmr(Afp, Pfp, m, B, param, instance);
 		default:
 			break;
 	}
 	return CLCG_UNKNOWN_SOLVER;
 }
 // Common signature of the preconditioned solvers declared below; it differs from
 // eigen_solver_ptr only by the additional preconditioner callback Mfp.
 typedef int (*eigen_preconditioned_solver_ptr)(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
 	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
 // Forward declarations of the preconditioned solvers (defined later in this file).
 // Selected by CLCG_PCG.
 int clpcg(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
 	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
 // Selected by CLCG_PBICG.
 int clpbicg(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
 	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
 /**
 * @brief      Entry point of the preconditioned complex solvers working on Eigen vectors.
 *
 * Forwards all arguments unchanged to the solver matching solver_id and hands back its status.
 *
 * @return     Status of the selected solver, or CLCG_UNKNOWN_SOLVER when solver_id is neither
 * CLCG_PCG nor CLCG_PBICG.
 */
 int clcg_solver_preconditioned_eigen(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
 	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance, clcg_solver_enum solver_id)
 {
 	if (solver_id == CLCG_PCG)
 	{
 		return clpcg(Afp, Mfp, Pfp, m, B, param, instance);
 	}
 	if (solver_id == CLCG_PBICG)
 	{
 		return clpbicg(Afp, Mfp, Pfp, m, B, param, instance);
 	}
 	// No preconditioned solver is registered for any other id.
 	return CLCG_UNKNOWN_SOLVER;
 }
 /**
 * @brief      BiCG-type solver for complex linear systems held in Eigen vectors (selected by CLCG_BICG).
 *
 * @note       The convergence measure is std::norm(r.dot(r)) of the residual r. With para.abs_diff set 
 * the tested value is sqrt(measure)/n_size, otherwise measure/max(initial measure, 1.0).
 * NOTE(review): std::norm returns the squared magnitude, so the measure squares Eigen's dot product 
 * once more; kept as is because it defines the meaning of epsilon for callers -- confirm intent.
 *
 * @note       NOTE(review): the LCG_-prefixed codes returned below are kept from the original although 
 * the other statuses use the CLCG_ prefix -- confirm against util.h whether CLCG_ codes were intended.
 *
 * @param[in]  Afp         Callback computing the product 'Ax'; also called with MatTranspose and Conjugate.
 * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value inside 
 * the iteration stops the solver with CLCG_STOP.
 * @param      m           Initial solution on entry, updated in place.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  param       Solver parameters; defparam2 is used when this is nullptr.
 * @param      instance    User data passed through to the callbacks.
 *
 * @return     CLCG_INVILAD_VARIABLE_SIZE, CLCG_SIZE_NOT_MATCH, CLCG_INVILAD_MAX_ITERATIONS or 
 * CLCG_INVILAD_EPSILON for rejected input; LCG_ALREADY_OPTIMIZIED, CLCG_STOP, CLCG_CONVERGENCE or 
 * LCG_REACHED_MAX_ITERATIONS otherwise.
 */
 int clbicg(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
 {
 	// set solver parameters
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	int n_size = B.size();
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	std::complex<lcg_float> ak, Ad1d2, r1r2_next, betak;
 	Eigen::VectorXcd r1k(n_size), r2k(n_size), d1k(n_size), d2k(n_size);
 	Eigen::VectorXcd Ax(n_size);
 	Afp(instance, m, Ax, MatNormal, NonConjugate);
 	d1k = r1k = B - Ax;
 	d2k = r2k = r1k.conjugate();
 	// Eigen's dot is inner product
 	std::complex<lcg_float> r1r2 = r2k.dot(r1k);
 	lcg_float rk_mod = std::norm(r1k.dot(r1k));
 	lcg_float r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	// Test only the criterion selected by abs_diff. Falling back to the relative test while
 	// abs_diff is set would report an early exit the caller did not ask for.
 	lcg_float residual = para.abs_diff ? std::sqrt(rk_mod)/n_size : rk_mod/r0_mod;
 	if (residual <= para.epsilon)
 	{
 		// The callback's return value is not inspected here, we are leaving anyway.
 		if (Pfp != nullptr) Pfp(instance, &m, residual, &para, 0);
 		return LCG_ALREADY_OPTIMIZIED;
 	}
 	// The Eigen work vectors release their storage in their destructors on every return path.
 	int t = 0;
 	while(1)
 	{
 		if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
 		else residual = rk_mod/r0_mod;
 		if (Pfp != nullptr && Pfp(instance, &m, residual, &para, t))
 		{
 			return CLCG_STOP;
 		}
 		if (residual <= para.epsilon)
 		{
 			return CLCG_CONVERGENCE;
 		}
 		// max_iterations == 0 disables the iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			return LCG_REACHED_MAX_ITERATIONS;
 		}
 		t++;
 		Afp(instance, d1k, Ax, MatNormal, NonConjugate);
 		Ad1d2 = d2k.dot(Ax);
 		ak = r1r2/Ad1d2;
 		m = m + ak*d1k;
 		r1k = r1k - ak*Ax;
 		rk_mod = std::norm(r1k.dot(r1k));
 		// Ax is reused for the product with the second (shadow) direction
 		Afp(instance, d2k, Ax, MatTranspose, Conjugate);
 		r2k = r2k - std::conj(ak)*Ax;
 		r1r2_next = r2k.dot(r1k);
 		betak = r1r2_next/r1r2;
 		r1r2 = r1r2_next;
 		d1k = r1k + betak*d1k;
 		d2k = r2k + std::conj(betak)*d2k;
 	}
 }
 /**
 * @brief      Single-sequence BiCG-type solver for complex linear systems held in Eigen vectors 
 * (selected by CLCG_BICG_SYM). Only products with MatNormal/NonConjugate are requested from Afp.
 *
 * @note       The convergence measure is std::norm(r.dot(r)) of the residual r. With para.abs_diff set 
 * the tested value is sqrt(measure)/n_size, otherwise measure/max(initial measure, 1.0).
 *
 * @note       NOTE(review): the LCG_-prefixed codes returned below are kept from the original although 
 * the other statuses use the CLCG_ prefix -- confirm against util.h whether CLCG_ codes were intended.
 *
 * @param[in]  Afp         Callback computing the product 'Ax'.
 * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value inside 
 * the iteration stops the solver with CLCG_STOP.
 * @param      m           Initial solution on entry, updated in place.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  param       Solver parameters; defparam2 is used when this is nullptr.
 * @param      instance    User data passed through to the callbacks.
 *
 * @return     CLCG_INVILAD_VARIABLE_SIZE, CLCG_SIZE_NOT_MATCH, CLCG_INVILAD_MAX_ITERATIONS or 
 * CLCG_INVILAD_EPSILON for rejected input; LCG_ALREADY_OPTIMIZIED, CLCG_STOP, CLCG_CONVERGENCE or 
 * LCG_REACHED_MAX_ITERATIONS otherwise.
 */
 int clbicg_symmetric(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
 {
 	// set solver parameters
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	int n_size = B.size();
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	std::complex<lcg_float> ak, rkrk2, betak, dkAx;
 	Eigen::VectorXcd rk(n_size), dk(n_size), Ax(n_size);
 	Afp(instance, m, Ax, MatNormal, NonConjugate);
 	dk = rk = (B - Ax);
 	// conjugating the left operand first cancels the conjugation done by Eigen's dot
 	std::complex<lcg_float> rkrk = rk.conjugate().dot(rk);
 	lcg_float rk_mod = std::norm(rk.dot(rk));
 	lcg_float r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	// Test only the criterion selected by abs_diff. Falling back to the relative test while
 	// abs_diff is set would report an early exit the caller did not ask for.
 	lcg_float residual = para.abs_diff ? std::sqrt(rk_mod)/n_size : rk_mod/r0_mod;
 	if (residual <= para.epsilon)
 	{
 		// The callback's return value is not inspected here, we are leaving anyway.
 		if (Pfp != nullptr) Pfp(instance, &m, residual, &para, 0);
 		return LCG_ALREADY_OPTIMIZIED;
 	}
 	// The Eigen work vectors release their storage in their destructors on every return path.
 	int t = 0;
 	while(1)
 	{
 		if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
 		else residual = rk_mod/r0_mod;
 		if (Pfp != nullptr && Pfp(instance, &m, residual, &para, t))
 		{
 			return CLCG_STOP;
 		}
 		if (residual <= para.epsilon)
 		{
 			return CLCG_CONVERGENCE;
 		}
 		// max_iterations == 0 disables the iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			return LCG_REACHED_MAX_ITERATIONS;
 		}
 		t++;
 		Afp(instance, dk, Ax, MatNormal, NonConjugate);
 		dkAx = dk.conjugate().dot(Ax);
 		ak = rkrk/dkAx;
 		m += ak*dk;
 		rk -= ak*Ax;
 		rk_mod = std::norm(rk.dot(rk));
 		rkrk2 = rk.conjugate().dot(rk);
 		betak = rkrk2/rkrk;
 		rkrk = rkrk2;
 		dk = rk + betak*dk;
 	}
 }
 /**
 * @brief      CGS-type solver for complex linear systems held in Eigen vectors (selected by CLCG_CGS).
 *
 * @note       A random shadow vector s0 is drawn until |s0^T r0| reaches 1e-8. The draw happens after 
 * the initial convergence test, because a zero initial residual would keep that loop running forever.
 *
 * @note       The convergence measure is std::norm(r.dot(r)) of the residual r. With para.abs_diff set 
 * the tested value is sqrt(measure)/n_size, otherwise measure/max(initial measure, 1.0).
 *
 * @note       NOTE(review): the LCG_-prefixed codes returned below are kept from the original although 
 * the other statuses use the CLCG_ prefix -- confirm against util.h whether CLCG_ codes were intended.
 *
 * @param[in]  Afp         Callback computing the product 'Ax'.
 * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value inside 
 * the iteration stops the solver with CLCG_STOP.
 * @param      m           Initial solution on entry, updated in place.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  param       Solver parameters; defparam2 is used when this is nullptr.
 * @param      instance    User data passed through to the callbacks.
 *
 * @return     CLCG_INVILAD_VARIABLE_SIZE, CLCG_SIZE_NOT_MATCH, CLCG_INVILAD_MAX_ITERATIONS or 
 * CLCG_INVILAD_EPSILON for rejected input; LCG_ALREADY_OPTIMIZIED, CLCG_STOP, CLCG_CONVERGENCE or 
 * LCG_REACHED_MAX_ITERATIONS otherwise.
 */
 int clcgs(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
 {
 	// set CGS parameters
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	int n_size = B.size();
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	std::complex<lcg_float> ak, rhok2, sigma, betak;
 	Eigen::VectorXcd rk(n_size), s0, pk(n_size);
 	Eigen::VectorXcd Ax(n_size), uk(n_size), qk(n_size), wk(n_size);
 	Afp(instance, m, Ax, MatNormal, NonConjugate);
 	pk = uk = rk = (B - Ax);
 	lcg_float rk_mod = std::norm(rk.dot(rk));
 	lcg_float r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	// Test only the criterion selected by abs_diff. Falling back to the relative test while
 	// abs_diff is set would report an early exit the caller did not ask for.
 	lcg_float residual = para.abs_diff ? std::sqrt(rk_mod)/n_size : rk_mod/r0_mod;
 	if (residual <= para.epsilon)
 	{
 		// The callback's return value is not inspected here, we are leaving anyway.
 		if (Pfp != nullptr) Pfp(instance, &m, residual, &para, 0);
 		return LCG_ALREADY_OPTIMIZIED;
 	}
 	// Draw the shadow vector only now: with rk == 0 the product below is always zero and
 	// the loop could never terminate.
 	std::complex<lcg_float> rhok;
 	do
 	{
 		s0 = Eigen::VectorXcd::Random(n_size);
 		rhok = s0.conjugate().dot(rk);
 	} while (std::sqrt(std::norm(rhok)) < 1e-8);
 	// The Eigen work vectors release their storage in their destructors on every return path.
 	int t = 0;
 	while(1)
 	{
 		if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
 		else residual = rk_mod/r0_mod;
 		if (Pfp != nullptr && Pfp(instance, &m, residual, &para, t))
 		{
 			return CLCG_STOP;
 		}
 		if (residual <= para.epsilon)
 		{
 			return CLCG_CONVERGENCE;
 		}
 		// max_iterations == 0 disables the iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			return LCG_REACHED_MAX_ITERATIONS;
 		}
 		t++;
 		Afp(instance, pk, Ax, MatNormal, NonConjugate);
 		sigma = s0.conjugate().dot(Ax);
 		ak = rhok/sigma;
 		qk = uk - ak*Ax;
 		wk = uk + qk;
 		Afp(instance, wk, Ax, MatNormal, NonConjugate);
 		m += ak*wk;
 		rk -= ak*Ax;
 		rk_mod = std::norm(rk.dot(rk));
 		rhok2 = s0.conjugate().dot(rk);
 		betak = rhok2/rhok;
 		rhok = rhok2;
 		uk = rk + betak*qk;
 		pk = uk + betak*(qk + betak*pk);
 	}
 }
 /**
 * @brief      TFQMR-type solver for complex linear systems held in Eigen vectors (selected by CLCG_TFQMR).
 *
 * @note       Every outer pass performs two solution updates (j = 1, 2); the iteration counter t, the 
 * progress callback and the stopping tests are evaluated before each of them.
 *
 * @note       Reaching para.max_iterations returns immediately from inside the inner loop. A plain 
 * 'break' there would only leave the inner loop and the outer loop would keep running.
 *
 * @note       The convergence measure is std::norm(r.dot(r)) of the residual r. With para.abs_diff set 
 * the tested value is sqrt(measure)/n_size, otherwise measure/max(initial measure, 1.0).
 *
 * @note       NOTE(review): the LCG_-prefixed codes returned below are kept from the original although 
 * the other statuses use the CLCG_ prefix -- confirm against util.h whether CLCG_ codes were intended.
 *
 * @param[in]  Afp         Callback computing the product 'Ax'.
 * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value inside 
 * the iteration stops the solver with CLCG_STOP.
 * @param      m           Initial solution on entry, updated in place.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  param       Solver parameters; defparam2 is used when this is nullptr.
 * @param      instance    User data passed through to the callbacks.
 *
 * @return     CLCG_INVILAD_VARIABLE_SIZE, CLCG_SIZE_NOT_MATCH, CLCG_INVILAD_MAX_ITERATIONS or 
 * CLCG_INVILAD_EPSILON for rejected input; LCG_ALREADY_OPTIMIZIED, CLCG_STOP, CLCG_CONVERGENCE or 
 * LCG_REACHED_MAX_ITERATIONS otherwise.
 */
 int cltfqmr(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
 {
 	// set solver parameters
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	int n_size = B.size();
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	Eigen::VectorXcd pk(n_size), uk(n_size), vk(n_size), dk(n_size);
 	Eigen::VectorXcd r0(n_size), rk(n_size), Ax(n_size), qk(n_size);
 	Eigen::VectorXcd uqk(n_size);
 	Afp(instance, m, Ax, MatNormal, NonConjugate);
 	pk = uk = r0 = rk = (B - Ax);
 	dk.setZero();
 	std::complex<lcg_float> rk_mod = rk.dot(rk);
 	lcg_float r0_mod = std::norm(rk_mod);
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	lcg_float theta = 0.0, omega = sqrt(rk_mod.real());
 	lcg_float residual, tao = omega;
 	std::complex<lcg_float> rk_mod2, sigma, alpha, betak, rho, rho2, sign, eta(0.0, 0.0);
 	rho = r0.dot(r0);
 	// Test only the criterion selected by abs_diff. Falling back to the relative test while
 	// abs_diff is set would report an early exit the caller did not ask for.
 	residual = para.abs_diff ? std::sqrt(std::norm(rk_mod))/n_size : std::norm(rk_mod)/r0_mod;
 	if (residual <= para.epsilon)
 	{
 		// The callback's return value is not inspected here, we are leaving anyway.
 		if (Pfp != nullptr) Pfp(instance, &m, residual, &para, 0);
 		return LCG_ALREADY_OPTIMIZIED;
 	}
 	// The Eigen work vectors release their storage in their destructors on every return path.
 	int t = 0;
 	while(1)
 	{
 		Afp(instance, pk, vk, MatNormal, NonConjugate);
 		sigma = r0.dot(vk);
 		alpha = rho/sigma;
 		qk = uk - alpha*vk;
 		uqk = uk + qk;
 		Afp(instance, uqk, Ax, MatNormal, NonConjugate);
 		rk -= alpha*Ax;
 		rk_mod2 = rk.dot(rk);
 		for (int j = 1; j <= 2; j++)
 		{
 			// rk_mod still holds the value from before this outer pass; it is refreshed after the loop
 			if (para.abs_diff) residual = std::sqrt(std::norm(rk_mod))/n_size;
 			else residual = std::norm(rk_mod)/r0_mod;
 			if (Pfp != nullptr && Pfp(instance, &m, residual, &para, t))
 			{
 				return CLCG_STOP;
 			}
 			if (residual <= para.epsilon)
 			{
 				return CLCG_CONVERGENCE;
 			}
 			// Leave the whole solver here, not just the inner loop.
 			if (para.max_iterations > 0 && t+1 > para.max_iterations)
 			{
 				return LCG_REACHED_MAX_ITERATIONS;
 			}
 			t++;
 			sign = theta*theta*(eta/alpha);
 			if (j == 1)
 			{
 				omega = sqrt(sqrt(rk_mod.real())*sqrt(rk_mod2.real()));
 				dk = uk + sign*dk;
 			}
 			else
 			{
 				omega = sqrt(rk_mod2.real());
 				dk = qk + sign*dk;
 			}
 			theta = omega/tao;
 			tao = omega/sqrt(1.0+theta*theta);
 			eta = (1.0/(1.0+theta*theta))*alpha;
 			m += eta*dk;
 		}
 		rk_mod = rk_mod2;
 		rho2 = r0.dot(rk);
 		betak = rho2/rho;
 		rho = rho2;
 		uk = rk + betak*qk;
 		pk = uk + betak*(qk + betak*pk);
 	}
 }
 /**
 * @brief      Preconditioned CG-type solver for complex linear systems held in Eigen vectors 
 * (selected by CLCG_PCG).
 *
 * @note       The convergence measure is std::norm(r.dot(r)) of the residual r. With para.abs_diff set 
 * the tested value is sqrt(measure)/n_size, otherwise measure/max(initial measure, 1.0).
 *
 * @note       NOTE(review): the LCG_-prefixed codes returned below are kept from the original although 
 * the other statuses use the CLCG_ prefix -- confirm against util.h whether CLCG_ codes were intended.
 *
 * @param[in]  Afp         Callback computing the product 'Ax'.
 * @param[in]  Mfp         Callback applying the preconditioner to a vector.
 * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value inside 
 * the iteration stops the solver with CLCG_STOP.
 * @param      m           Initial solution on entry, updated in place.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  param       Solver parameters; defparam2 is used when this is nullptr.
 * @param      instance    User data passed through to the callbacks.
 *
 * @return     CLCG_INVILAD_VARIABLE_SIZE, CLCG_SIZE_NOT_MATCH, CLCG_INVILAD_MAX_ITERATIONS or 
 * CLCG_INVILAD_EPSILON for rejected input; LCG_ALREADY_OPTIMIZIED, CLCG_STOP, CLCG_CONVERGENCE or 
 * LCG_REACHED_MAX_ITERATIONS otherwise.
 */
 int clpcg(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
 	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
 {
 	// set solver parameters
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	int n_size = B.size();
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	Eigen::VectorXcd rk(n_size), dk(n_size), sk(n_size), Ax(n_size);
 	Afp(instance, m, Ax, MatNormal, NonConjugate);
 	rk = (B - Ax);
 	// the first search direction is the preconditioned residual
 	Mfp(instance, rk, dk, MatNormal, NonConjugate);
 	std::complex<lcg_float> ak, d_old, betak, dkAx;
 	std::complex<lcg_float> d_new = rk.conjugate().dot(dk);
 	lcg_float rk_mod = std::norm(rk.dot(rk));
 	lcg_float r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	// Test only the criterion selected by abs_diff. Falling back to the relative test while
 	// abs_diff is set would report an early exit the caller did not ask for.
 	lcg_float residual = para.abs_diff ? std::sqrt(rk_mod)/n_size : rk_mod/r0_mod;
 	if (residual <= para.epsilon)
 	{
 		// The callback's return value is not inspected here, we are leaving anyway.
 		if (Pfp != nullptr) Pfp(instance, &m, residual, &para, 0);
 		return LCG_ALREADY_OPTIMIZIED;
 	}
 	// The Eigen work vectors release their storage in their destructors on every return path.
 	int t = 0;
 	while(1)
 	{
 		if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
 		else residual = rk_mod/r0_mod;
 		if (Pfp != nullptr && Pfp(instance, &m, residual, &para, t))
 		{
 			return CLCG_STOP;
 		}
 		if (residual <= para.epsilon)
 		{
 			return CLCG_CONVERGENCE;
 		}
 		// max_iterations == 0 disables the iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			return LCG_REACHED_MAX_ITERATIONS;
 		}
 		t++;
 		Afp(instance, dk, Ax, MatNormal, NonConjugate);
 		dkAx = dk.conjugate().dot(Ax);
 		ak = d_new/dkAx;
 		m += ak*dk;
 		rk -= ak*Ax;
 		rk_mod = std::norm(rk.dot(rk));
 		Mfp(instance, rk, sk, MatNormal, NonConjugate);
 		d_old = d_new;
 		d_new = rk.conjugate().dot(sk);
 		betak = d_new/d_old;
 		dk = sk + betak*dk;
 	}
 }
 /**
 * @brief      Preconditioned BiCG-type solver for complex linear systems held in Eigen vectors 
 * (selected by CLCG_PBICG).
 *
 * @note       The convergence measure is std::norm(r.dot(r)) of the residual r. With para.abs_diff set 
 * the tested value is sqrt(measure)/n_size, otherwise measure/max(initial measure, 1.0).
 *
 * @note       NOTE(review): the LCG_-prefixed codes returned below are kept from the original although 
 * the other statuses use the CLCG_ prefix -- confirm against util.h whether CLCG_ codes were intended.
 *
 * @param[in]  Afp         Callback computing the product 'Ax'; also called with the Conjugate flag.
 * @param[in]  Mfp         Callback applying the preconditioner to a vector.
 * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value inside 
 * the iteration stops the solver with CLCG_STOP.
 * @param      m           Initial solution on entry, updated in place.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  param       Solver parameters; defparam2 is used when this is nullptr.
 * @param      instance    User data passed through to the callbacks.
 *
 * @return     CLCG_INVILAD_VARIABLE_SIZE, CLCG_SIZE_NOT_MATCH, CLCG_INVILAD_MAX_ITERATIONS or 
 * CLCG_INVILAD_EPSILON for rejected input; LCG_ALREADY_OPTIMIZIED, CLCG_STOP, CLCG_CONVERGENCE or 
 * LCG_REACHED_MAX_ITERATIONS otherwise.
 */
 int clpbicg(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
 	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
 {
 	// set solver parameters
 	clcg_para para = (param != nullptr) ? (*param) : defparam2;
 	int n_size = B.size();
 	//check parameters
 	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
 	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
 	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
 	std::complex<lcg_float> ak, betak, pkAx, rhok2;
 	Eigen::VectorXcd rk(n_size), rsk(n_size), zk(n_size), pk(n_size), psk(n_size), Ax(n_size), Asx(n_size);
 	Afp(instance, m, Ax, MatNormal, NonConjugate);
 	rk = (B - Ax);
 	Mfp(instance, rk, zk, MatNormal, NonConjugate);
 	pk = zk;
 	// shadow residual and direction start as the conjugates of the primary ones
 	rsk = rk.conjugate();
 	psk = pk.conjugate();
 	std::complex<lcg_float> rhok = rsk.dot(zk);
 	lcg_float rk_mod = std::norm(rk.dot(rk));
 	lcg_float r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	// Test only the criterion selected by abs_diff. Falling back to the relative test while
 	// abs_diff is set would report an early exit the caller did not ask for.
 	lcg_float residual = para.abs_diff ? std::sqrt(rk_mod)/n_size : rk_mod/r0_mod;
 	if (residual <= para.epsilon)
 	{
 		// The callback's return value is not inspected here, we are leaving anyway.
 		if (Pfp != nullptr) Pfp(instance, &m, residual, &para, 0);
 		return LCG_ALREADY_OPTIMIZIED;
 	}
 	// The Eigen work vectors release their storage in their destructors on every return path.
 	int t = 0;
 	while(1)
 	{
 		if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
 		else residual = rk_mod/r0_mod;
 		if (Pfp != nullptr && Pfp(instance, &m, residual, &para, t))
 		{
 			return CLCG_STOP;
 		}
 		if (residual <= para.epsilon)
 		{
 			return CLCG_CONVERGENCE;
 		}
 		// max_iterations == 0 disables the iteration limit
 		if (para.max_iterations > 0 && t+1 > para.max_iterations)
 		{
 			return LCG_REACHED_MAX_ITERATIONS;
 		}
 		t++;
 		Afp(instance, pk, Ax, MatNormal, NonConjugate);
 		// NOTE(review): clbicg() requests MatTranspose together with Conjugate for its shadow
 		// product, this solver requests MatNormal -- confirm that the difference is intended.
 		Afp(instance, psk, Asx, MatNormal, Conjugate);
 		pkAx = psk.dot(Ax);
 		ak = rhok/pkAx;
 		m += ak*pk;
 		// rsk must be formed from rk before rk itself is updated on the next line
 		rsk = rk.conjugate() - std::conj(ak)*Asx;
 		rk -= ak*Ax;
 		rk_mod = std::norm(rk.dot(rk));
 		Mfp(instance, rk, zk, MatNormal, NonConjugate);
 		rhok2 = rsk.dot(zk);
 		betak = rhok2/rhok;
 		rhok = rhok2;
 		pk = zk + betak*pk;
 		psk = zk.conjugate() + std::conj(betak)*psk;
 	}
 }
--- a/src/lib/clcg_eigen.h
+++ b/src/lib/clcg_eigen.h
@@ -0,0 +1,94 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _CLCG_EIGEN_H
 #define _CLCG_EIGEN_H
 #include "util.h"
 #include "complex"
 #include "Eigen/Dense"
 /**
 * @brief  Callback interface for calculating the product of a N*N matrix 'A' multiplied 
 * by a vertical vector 'x'.
 * 
 * @param  instance    The user data sent for the solver functions by the client.
 * @param  x           Multiplier of the Ax product.
 * @param  prod_Ax     Product of A multiplied by x (output).
 * @param  layout      Layout information of the matrix A passed by the solver functions.
 * @param  conjugate   Conjugation information of the matrix A passed by the solver functions.
 */
 typedef void (*clcg_axfunc_eigen_ptr)(void* instance, const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Ax, 
 	lcg_matrix_e layout, clcg_complex_e conjugate);
 /**
 * @brief     Callback interface for monitoring the progress and terminating the iteration 
 * if necessary.
 * 
 * @param    instance    The user data sent for the solver functions by the client.
 * @param    m           Pointer to the current solution vector.
 * @param    converge    The current value evaluating the iteration progress.
 * @param    param       The parameter object passed by the solver functions.
 * @param    k           The iteration count.
 * 
 * @retval   int         Zero to continue the optimization process. Returning a
 *                       non-zero value will terminate the optimization process.
 */
 typedef int (*clcg_progress_eigen_ptr)(void* instance, const Eigen::VectorXcd *m, const lcg_float converge, 
 	const clcg_para *param, const int k);
 /**
 * @brief      A combined conjugate gradient solver function for complex systems stored in Eigen vectors.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param      B           Objective vector of the linear system.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the solver function by the client. 
 * This variable is either 'this' for class member functions or 'nullptr' for global functions.
 * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_CGS.
 *
 * @return     Status of the function.
 */
 int clcg_solver_eigen(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
 	const Eigen::VectorXcd &B, const clcg_para* param, void* instance, clcg_solver_enum solver_id = CLCG_CGS);
 /**
 * @brief      A combined preconditioned conjugate gradient solver function for complex systems stored in Eigen vectors.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Mfp         Callback function for calculating the product of 'M^{-1}x', in which M is the preconditioning matrix.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param      B           Objective vector of the linear system.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the solver function by the client. 
 * This variable is either 'this' for class member functions or 'nullptr' for global functions.
 * @param      solver_id   Solver type used to solve the linear system. The value must be CLCG_PBICG (default) or CLCG_PCG.
 *
 * @return     Status of the function.
 */
 int clcg_solver_preconditioned_eigen(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
    Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance, clcg_solver_enum solver_id = CLCG_PBICG);
 #endif // _CLCG_EIGEN_H
--- a/src/lib/lcg.cpp
+++ b/src/lib/lcg.cpp
--- a/src/lib/lcg.h
+++ b/src/lib/lcg.h
@@ -0,0 +1,171 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _LCG_H
 #define _LCG_H
 #include "util.h"
 /**
 * @brief  Callback interface for calculating the product of a N*N matrix 'A' multiplied 
 * by a vertical vector 'x'.
 * 
 * @param  instance    The user data sent for the lcg_solver() functions by the client.
 * @param  x           Multiplier of the Ax product.
 * @param  prod_Ax     Product of A multiplied by x (output).
 * @param  n_size      Size of x and column/row numbers of A.
 */
 typedef void (*lcg_axfunc_ptr)(void* instance, const lcg_float* x, lcg_float* prod_Ax, 
 	const int n_size);
 /**
 * @brief     Callback interface for monitoring the progress and terminating the iteration 
 * if necessary.
 * 
 * @param    instance    The user data sent for the lcg_solver() functions by the client.
 * @param    m           The current solutions.
 * @param    converge    The current value evaluating the iteration progress.
 * @param    param       The parameter object passed by the solver functions.
 * @param    n_size      The size of the variables.
 * @param    k           The iteration count.
 * 
 * @retval   int         Zero to continue the optimization process. Returning a
 *                       non-zero value will terminate the optimization process.
 */
 typedef int (*lcg_progress_ptr)(void* instance, const lcg_float* m, const lcg_float converge, 
 	const lcg_para* param, const int n_size, const int k);
 /**
 * @brief      A combined conjugate gradient solver function.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param[in]  solver_id   Solver type used to solve the linear system. The default value is LCG_CGS.
 *
 * @return     Status of the function.
 */
 int lcg_solver(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
 	const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_CGS);
 /**
 * @brief      A combined preconditioned conjugate gradient solver function.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Mfp         Callback function for calculating the product of 'M^{-1}x', in which M is the preconditioning matrix.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param[in]  solver_id   Solver type used to solve the linear system. The default value is LCG_PCG.
 *
 * @return     Status of the function.
 */
 int lcg_solver_preconditioned(lcg_axfunc_ptr Afp, lcg_axfunc_ptr Mfp, lcg_progress_ptr Pfp, lcg_float* m, 
 	const lcg_float* B, const int n_size, const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_PCG);
 /**
 * @brief      A combined conjugate gradient solver function with inequality constraints.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  low         The lower boundary of the acceptable solution.
 * @param[in]  hig         The higher boundary of the acceptable solution.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param[in]  solver_id   Solver type used to solve the linear system. The default value is LCG_PG.
 *
 * @return     Status of the function.
 */
 int lcg_solver_constrained(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, 
 	const lcg_float* low, const lcg_float *hig, const int n_size, const lcg_para* param, 
 	void* instance, lcg_solver_enum solver_id = LCG_PG);
 /**
 * @brief      Standalone function of the Linear Conjugate Gradient algorithm.
 * 
 * @note       To use the lcg() function for massive inversions, it is better to provide 
 * external vectors Gk, Dk and ADk to avoid allocating and destroying temporary vectors.
 *
 * @param[in]  Afp       Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp       Callback function for monitoring the iteration progress.
 * @param      m         Initial solution vector of the size n_size.
 * @param[in]  B         Objective vector of the linear system.
 * @param[in]  n_size    Size of the solution vector and objective vector.
 * @param[in]  param     Parameter setup for the conjugate gradient methods.
 * @param      instance  The user data sent for the lcg() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      Gk        Conjugate gradient vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
 * @param      Dk        Directional gradient vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
 * @param      ADk       Intermediate vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
 *
 * @return     Status of the function.
 */
 int lcg(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
    const lcg_para* param, void* instance, lcg_float* Gk = nullptr, lcg_float* Dk = nullptr, 
    lcg_float* ADk = nullptr);
 /**
 * @brief      Standalone function of the Conjugate Gradient Squared algorithm.
 * 
 * @note       Algorithm 2 in "Generalized conjugate gradient method" by Fokkema et al. (1996).
 * 
 * @note       To use the lcgs() function for massive inversions, it is better to provide 
 * external vectors RK, R0T, PK, AX, UK, QK, and WK to avoid allocating and destroying temporary vectors.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param[in]  B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcgs() function by the client. 
 * This variable is either 'this' for class member functions or 'nullptr' for global functions.
 * @param      RK          Intermediate vector of the size n_size. If this pointer is null, the function will create an internal vector instead.
 * @param      R0T         Intermediate vector of the size n_size. If this pointer is null, the function will create an internal vector instead.
 * @param      PK          Intermediate vector of the size n_size. If this pointer is null, the function will create an internal vector instead.
 * @param      AX          Intermediate vector of the size n_size. If this pointer is null, the function will create an internal vector instead.
 * @param      UK          Intermediate vector of the size n_size. If this pointer is null, the function will create an internal vector instead.
 * @param      QK          Intermediate vector of the size n_size. If this pointer is null, the function will create an internal vector instead.
 * @param      WK          Intermediate vector of the size n_size. If this pointer is null, the function will create an internal vector instead.
 *
 * @return     Status of the function.
 */
 int lcgs(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
    const lcg_para* param, void* instance, lcg_float* RK = nullptr, lcg_float* R0T = nullptr, 
    lcg_float* PK = nullptr, lcg_float* AX = nullptr, lcg_float* UK = nullptr, lcg_float* QK = nullptr, 
    lcg_float* WK = nullptr);
 #endif // _LCG_H
--- a/src/lib/lcg_complex.cpp
+++ b/src/lib/lcg_complex.cpp
@@ -0,0 +1,496 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "cmath"
 #include "ctime"
 #include "random"
 #include "lcg_complex.h"
 #ifdef LibLCG_OPENMP
 #include "omp.h"
 #endif
 /**
 * @brief      Allocate an array of n default-constructed lcg_complex values with new[].
 *
 * @param[in]  n     Number of elements to allocate.
 *
 * @return     Pointer to the first element of the new array.
 */
 lcg_complex* clcg_malloc(int n)
 {
 	return new lcg_complex [n];
 }
 /**
 * @brief      Allocate a 2D array as m separately allocated rows of n lcg_complex values.
 *
 * @param[in]  m     Number of rows (length of the row-pointer array).
 * @param[in]  n     Number of elements in every row.
 *
 * @return     Pointer to the array of row pointers.
 */
 lcg_complex** clcg_malloc(int m, int n)
 {
 	lcg_complex **rows = new lcg_complex* [m];
 	int r = 0;
 	while (r < m)
 	{
 		rows[r] = new lcg_complex [n];
 		++r;
 	}
 	return rows;
 }
 /**
 * @brief      Release an array with delete[]. A null pointer is accepted and ignored.
 *
 * The pointer is passed by value, so the caller's copy is not reset by this function.
 *
 * @param      x     Pointer of the array (may be nullptr).
 */
 void clcg_free(lcg_complex* x)
 {
 	// delete[] on a null pointer is a no-op, so no explicit check is needed.
 	delete[] x;
 }
 /**
 * @brief      Release a 2D array: every one of the m rows first, then the row-pointer array.
 *
 * The pointer is passed by value, so the caller's copy is not reset by this function.
 *
 * @param      x     Pointer of the row-pointer array (may be nullptr).
 * @param[in]  m     Number of rows stored in x.
 */
 void clcg_free(lcg_complex **x, int m)
 {
 	if (x == nullptr)
 	{
 		return;
 	}

 	for (int r = 0; r < m; ++r)
 	{
 		delete[] x[r];
 	}
 	delete[] x;
 }
 /**
 * @brief      Assign the value b to the first size elements of a.
 *
 * @param      a     Destination vector.
 * @param[in]  b     Value copied into every element.
 * @param[in]  size  Number of elements to set.
 */
 void clcg_vecset(lcg_complex *a, lcg_complex b, int size)
 {
 	int k = 0;
 	while (k < size)
 	{
 		a[k] = b;
 		++k;
 	}
 }
 /**
 * @brief      Assign the value b to every element of an m-by-n array of row pointers.
 *
 * @param      a     Destination array, indexed as a[row][column].
 * @param[in]  b     Value copied into every element.
 * @param[in]  m     Number of rows.
 * @param[in]  n     Number of elements per row.
 */
 void clcg_vecset(lcg_complex **a, lcg_complex b, int m, int n)
 {
 	for (int r = 0; r < m; ++r)
 	{
 		lcg_complex *row = a[r];
 		for (int c = 0; c < n; ++c)
 		{
 			row[c] = b;
 		}
 	}
 }
 #ifdef LibLCG_STD_COMPLEX
 /**
 * @brief      Overwrite both parts of the complex number pointed to by a.
 *
 * @param      a     Complex number to modify.
 * @param[in]  r     New real part.
 * @param[in]  i     New imaginary part.
 */
 void clcg_set(lcg_complex *a, lcg_float r, lcg_float i)
 {
 	*a = lcg_complex(r, i);
 }
 /**
 * @brief      Squared modulus of a complex number, as returned by std::norm.
 *
 * @param[in]  a     Complex number to evaluate.
 *
 * @return     The squared modulus.
 */
 lcg_float clcg_square(const lcg_complex *a)
 {
 	const lcg_complex &z = *a;
 	return std::norm(z);
 }
 /**
 * @brief      Modulus of a complex number, computed as the square root of std::norm.
 *
 * @param[in]  a     Complex number to evaluate.
 *
 * @return     The modulus.
 */
 lcg_float clcg_module(const lcg_complex *a)
 {
 	const lcg_float squared = std::norm(*a);
 	return sqrt(squared);
 }
 /**
 * @brief      Complex conjugate of the number pointed to by a.
 *
 * @param[in]  a     Complex number to conjugate.
 *
 * @return     The conjugate, as returned by std::conj.
 */
 lcg_complex clcg_conjugate(const lcg_complex *a)
 {
 	return std::conj(*a);
 }
 /**
 * @brief      Fill a vector with pseudo-random complex values drawn with rand().
 *
 * The generator is re-seeded from time(0) on every call. For each element
 * the real part is drawn first, then the imaginary part; each part is scaled
 * between the matching part of l and the matching part of h.
 *
 * @param      a     Destination vector.
 * @param[in]  l     Lower bounds (real and imaginary parts are used separately).
 * @param[in]  h     Upper bounds (real and imaginary parts are used separately).
 * @param[in]  size  Number of elements to fill.
 */
 void clcg_vecrnd(lcg_complex *a, lcg_complex l, lcg_complex h, int size)
 {
 	srand(time(0));
 	for (int k = 0; k < size; ++k)
 	{
 		// Declaration order fixes the order of the two rand() calls.
 		const lcg_float re = (h.real()-l.real())*rand()*1.0/RAND_MAX + l.real();
 		const lcg_float im = (h.imag()-l.imag())*rand()*1.0/RAND_MAX + l.imag();
 		a[k] = lcg_complex(re, im);
 	}
 }
 /**
 * @brief      Fill an m-by-n array of row pointers with pseudo-random complex values.
 *
 * The generator is re-seeded from time(0) on every call. Elements are
 * visited row by row; for each element the real part is drawn first, then
 * the imaginary part.
 *
 * @param      a     Destination array, indexed as a[row][column].
 * @param[in]  l     Lower bounds (real and imaginary parts are used separately).
 * @param[in]  h     Upper bounds (real and imaginary parts are used separately).
 * @param[in]  m     Number of rows.
 * @param[in]  n     Number of elements per row.
 */
 void clcg_vecrnd(lcg_complex **a, lcg_complex l, lcg_complex h, int m, int n)
 {
 	srand(time(0));
 	for (int r = 0; r < m; ++r)
 	{
 		lcg_complex *row = a[r];
 		for (int c = 0; c < n; ++c)
 		{
 			// Declaration order fixes the order of the two rand() calls.
 			const lcg_float re = (h.real()-l.real())*rand()*1.0/RAND_MAX + l.real();
 			const lcg_float im = (h.imag()-l.imag())*rand()*1.0/RAND_MAX + l.imag();
 			row[c] = lcg_complex(re, im);
 		}
 	}
 }
 /**
 * @brief      Unconjugated dot product, ret = \sum{a_i \cdot b_i}.
 *
 * The sum is accumulated in local variables and written to ret once at the end.
 *
 * @param[out] ret   Receives the result.
 * @param[in]  a     First vector.
 * @param[in]  b     Second vector.
 * @param[in]  size  Number of elements in both vectors.
 */
 void clcg_dot(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size)
 {
 	lcg_float sum_re = 0.0;
 	lcg_float sum_im = 0.0;
 	for (int k = 0; k < size; ++k)
 	{
 		sum_re += (a[k].real()*b[k].real() - a[k].imag()*b[k].imag());
 		sum_im += (a[k].real()*b[k].imag() + a[k].imag()*b[k].real());
 	}
 	ret.real(sum_re);
 	ret.imag(sum_im);
 }
 /**
 * @brief      Inner product with the first vector conjugated, ret = \sum{\bar{a_i} \cdot b_i}.
 *
 * The sum is accumulated in local variables and written to ret once at the end.
 *
 * @param[out] ret   Receives the result.
 * @param[in]  a     First vector (its conjugate is used).
 * @param[in]  b     Second vector.
 * @param[in]  size  Number of elements in both vectors.
 */
 void clcg_inner(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size)
 {
 	lcg_float sum_re = 0.0;
 	lcg_float sum_im = 0.0;
 	for (int k = 0; k < size; ++k)
 	{
 		sum_re += (a[k].real()*b[k].real() + a[k].imag()*b[k].imag());
 		sum_im += (a[k].real()*b[k].imag() - a[k].imag()*b[k].real());
 	}
 	ret.real(sum_re);
 	ret.imag(sum_im);
 }
 /**
 * @brief      Dense complex matrix-vector product (std::complex build).
 *
 * layout == MatNormal:   Ax[i] = \sum_j op(A[i][j]) * x[j]   for i in [0, m_size)
 * any other layout:      Ax[j] = \sum_i op(A[i][j]) * x[i]   for j in [0, n_size)
 * where op() is the complex conjugate when conjugate == Conjugate and the
 * identity otherwise.
 *
 * @param      A          Matrix given as m_size row pointers of n_size elements each.
 * @param[in]  x          Input vector (n_size elements for MatNormal, m_size otherwise).
 * @param      Ax         Output vector (m_size elements for MatNormal, n_size otherwise).
 * @param[in]  m_size     Number of rows of A.
 * @param[in]  n_size     Number of columns of A.
 * @param[in]  layout     MatNormal multiplies by A; any other value multiplies by its transpose.
 * @param[in]  conjugate  Conjugate uses the complex conjugate of the entries of A.
 */
 void clcg_matvec(lcg_complex **A, const lcg_complex *x, lcg_complex *Ax, 
 	int m_size, int n_size, lcg_matrix_e layout, clcg_complex_e conjugate)
 {
 	// Signed indices: the sizes are int, so an unsigned index would turn a
 	// negative size into a huge loop bound, and OpenMP 2.0 compilers only
 	// accept signed loop variables in a parallel for.
 	int i, j;
 	lcg_float re, im;
 	if (conjugate == Conjugate)
 	{
 		if (layout == MatNormal)
 		{
 			// Ax = \bar{A} x
#pragma omp parallel for private (i, j, re, im) schedule(guided)
 			for (i = 0; i < m_size; i++)
 			{
 				re = 0.0; im = 0.0;
 				for (j = 0; j < n_size; j++)
 				{
 					re += (A[i][j].real()*x[j].real() + A[i][j].imag()*x[j].imag());
 					im += (A[i][j].real()*x[j].imag() - A[i][j].imag()*x[j].real());
 				}
 				Ax[i].real(re); Ax[i].imag(im);
 			}
 			return;
 		}
 		// Ax = A^H x
#pragma omp parallel for private (i, j, re, im) schedule(guided)
 		for (j = 0; j < n_size; j++)
 		{
 			re = 0.0; im = 0.0;
 			for (i = 0; i < m_size; i++)
 			{
 				re += (A[i][j].real()*x[i].real() + A[i][j].imag()*x[i].imag());
 				im += (A[i][j].real()*x[i].imag() - A[i][j].imag()*x[i].real());
 			}
 			Ax[j].real(re); Ax[j].imag(im);
 		}
 		return;
 	}
 	if (layout == MatNormal)
 	{
 		// Ax = A x
#pragma omp parallel for private (i, j, re, im) schedule(guided)
 		for (i = 0; i < m_size; i++)
 		{
 			re = 0.0; im = 0.0;
 			for (j = 0; j < n_size; j++)
 			{
 				re += (A[i][j].real()*x[j].real() - A[i][j].imag()*x[j].imag());
 				im += (A[i][j].real()*x[j].imag() + A[i][j].imag()*x[j].real());
 			}
 			Ax[i].real(re); Ax[i].imag(im);
 		}
 		return;
 	}
 	// Ax = A^T x
#pragma omp parallel for private (i, j, re, im) schedule(guided)
 	for (j = 0; j < n_size; j++)
 	{
 		re = 0.0; im = 0.0;
 		for (i = 0; i < m_size; i++)
 		{
 			re += (A[i][j].real()*x[i].real() - A[i][j].imag()*x[i].imag());
 			im += (A[i][j].real()*x[i].imag() + A[i][j].imag()*x[i].real());
 		}
 		Ax[j].real(re); Ax[j].imag(im);
 	}
 	return;
 }
 #else
 /**
 * @brief      Default constructor: both parts start at zero.
 */
 lcg_complex::lcg_complex()
 	: rel(0.0), img(0.0)
 {
 }
 /**
 * @brief      Construct a complex number from its two parts.
 *
 * @param[in]  r     The real part.
 * @param[in]  i     The imaginary part.
 */
 lcg_complex::lcg_complex(lcg_float r, lcg_float i)
 	: rel(r), img(i)
 {
 }
 /**
 * @brief      Destructor. There is nothing to release.
 */
 lcg_complex::~lcg_complex()
 {
 }
 /**
 * @brief      Set the real part.
 *
 * @param[in]  a     New real part.
 */
 void lcg_complex::real(lcg_float a)
 {
 	this->rel = a;
 }
 /**
 * @brief      Set the imaginary part.
 *
 * @param[in]  a     New imaginary part.
 */
 void lcg_complex::imag(lcg_float a)
 {
 	this->img = a;
 }
 /**
 * @brief      Get the real part.
 *
 * @return     The real part.
 */
 lcg_float lcg_complex::real()
 {
 	return this->rel;
 }
 /**
 * @brief      Get the imaginary part.
 *
 * @return     The imaginary part.
 */
 lcg_float lcg_complex::imag()
 {
 	return this->img;
 }
 /**
 * @brief      Exact equality: both parts must compare equal.
 *
 * @param[in]  a     Left operand.
 * @param[in]  b     Right operand.
 *
 * @return     true when the real parts and the imaginary parts are both equal.
 */
 bool operator==(const lcg_complex &a, const lcg_complex &b)
 {
 	return a.rel == b.rel && a.img == b.img;
 }
 /**
 * @brief      Exact inequality: true as soon as one part differs.
 *
 * @param[in]  a     Left operand.
 * @param[in]  b     Right operand.
 *
 * @return     true when the real parts or the imaginary parts differ.
 */
 bool operator!=(const lcg_complex &a, const lcg_complex &b)
 {
 	return a.rel != b.rel || a.img != b.img;
 }
 /**
 * @brief      Component-wise sum of two complex numbers.
 *
 * @param[in]  a     Left operand.
 * @param[in]  b     Right operand.
 *
 * @return     a + b.
 */
 lcg_complex operator+(const lcg_complex &a, const lcg_complex &b)
 {
 	return lcg_complex(a.rel + b.rel, a.img + b.img);
 }
 /**
 * @brief      Component-wise difference of two complex numbers.
 *
 * @param[in]  a     Left operand.
 * @param[in]  b     Right operand.
 *
 * @return     a - b.
 */
 lcg_complex operator-(const lcg_complex &a, const lcg_complex &b)
 {
 	return lcg_complex(a.rel - b.rel, a.img - b.img);
 }
 /**
 * @brief      Product of two complex numbers.
 *
 * @param[in]  a     Left operand.
 * @param[in]  b     Right operand.
 *
 * @return     a * b.
 */
 lcg_complex operator*(const lcg_complex &a, const lcg_complex &b)
 {
 	const lcg_float prod_re = a.rel*b.rel - a.img*b.img;
 	const lcg_float prod_im = a.rel*b.img + a.img*b.rel;
 	return lcg_complex(prod_re, prod_im);
 }
 /**
 * @brief      Scale a complex number by a real factor.
 *
 * @param[in]  a     Real factor.
 * @param[in]  b     Complex number.
 *
 * @return     a * b.
 */
 lcg_complex operator*(const lcg_float &a, const lcg_complex &b)
 {
 	return lcg_complex(a*b.rel, a*b.img);
 }
 /**
 * @brief      Quotient of two complex numbers.
 *
 * @param[in]  a     Numerator.
 * @param[in]  b     Denominator.
 *
 * @return     a / b; both parts are NAN when b is exactly zero.
 */
 lcg_complex operator/(const lcg_complex &a, const lcg_complex &b)
 {
 	if (b.rel == 0 && b.img == 0)
 	{
 		return lcg_complex(NAN, NAN);
 	}

 	// a/b = a*conj(b) / |b|^2
 	const lcg_float denom = b.rel*b.rel + b.img*b.img;
 	const lcg_float quot_re = (a.rel*b.rel + a.img*b.img)/denom;
 	const lcg_float quot_im = (a.img*b.rel - a.rel*b.img)/denom;
 	return lcg_complex(quot_re, quot_im);
 }
 /**
 * @brief      Quotient of a real number and a complex number.
 *
 * @param[in]  a     Real numerator.
 * @param[in]  b     Complex denominator.
 *
 * @return     a / b; both parts are NAN when b is exactly zero.
 */
 lcg_complex operator/(const lcg_float &a, const lcg_complex &b)
 {
 	if (b.rel == 0 && b.img == 0)
 	{
 		return lcg_complex(NAN, NAN);
 	}

 	// a/b = a*conj(b) / |b|^2
 	const lcg_float denom = b.rel*b.rel + b.img*b.img;
 	const lcg_float quot_re = a*b.rel/denom;
 	const lcg_float quot_im = -1.0*a*b.img/denom;
 	return lcg_complex(quot_re, quot_im);
 }
 /**
 * @brief      Write a complex number as "<real>+<imag>i" or "<real>-<imag>i".
 *
 * A "+" is inserted only when the imaginary part compares >= 0; otherwise
 * the sign comes from the formatted imaginary part itself.
 *
 * @param      os    Output stream.
 * @param[in]  a     Complex number to print.
 *
 * @return     The output stream.
 */
 std::ostream &operator<<(std::ostream &os, const lcg_complex &a)
 {
 	os << a.rel;
 	if (a.img >= 0)
 	{
 		os << "+";
 	}
 	os << a.img << "i";
 	return os;
 }
 /**
 * @brief      Overwrite both parts of the complex number pointed to by a.
 *
 * @param      a     Complex number to modify.
 * @param[in]  r     New real part.
 * @param[in]  i     New imaginary part.
 */
 void clcg_set(lcg_complex *a, lcg_float r, lcg_float i)
 {
 	lcg_complex &z = *a;
 	z.rel = r;
 	z.img = i;
 }
 /**
 * @brief      Squared modulus of a complex number.
 *
 * @param[in]  a     Complex number to evaluate.
 *
 * @return     rel*rel + img*img.
 */
 lcg_float clcg_square(const lcg_complex *a)
 {
 	const lcg_complex &z = *a;
 	return z.rel * z.rel + z.img * z.img;
 }
 /**
 * @brief      Modulus of a complex number: the square root of clcg_square(a).
 *
 * @param[in]  a     Complex number to evaluate.
 *
 * @return     The modulus.
 */
 lcg_float clcg_module(const lcg_complex *a)
 {
 	const lcg_float squared = clcg_square(a);
 	return sqrt(squared);
 }
 /**
 * @brief      Complex conjugate: same real part, negated imaginary part.
 *
 * @param[in]  a     Complex number to conjugate.
 *
 * @return     The conjugate of *a.
 */
 lcg_complex clcg_conjugate(const lcg_complex *a)
 {
 	return lcg_complex(a->rel, -1.0 * a->img);
 }
 /**
 * @brief      Fill a vector with pseudo-random complex values drawn with rand().
 *
 * The generator is re-seeded from time(nullptr) on every call. For each
 * element the real part is drawn first, then the imaginary part.
 *
 * @param      a     Destination vector.
 * @param[in]  l     Lower bounds (real and imaginary parts are used separately).
 * @param[in]  h     Upper bounds (real and imaginary parts are used separately).
 * @param[in]  size  Number of elements to fill.
 */
 void clcg_vecrnd(lcg_complex *a, lcg_complex l, lcg_complex h, int size)
 {
 	srand(time(nullptr));
 	for (int k = 0; k < size; ++k)
 	{
 		lcg_complex &z = a[k];
 		z.rel = (h.rel-l.rel)*rand()*1.0/RAND_MAX + l.rel;
 		z.img = (h.img-l.img)*rand()*1.0/RAND_MAX + l.img;
 	}
 }
 /**
 * @brief      Fill an m-by-n array of row pointers with pseudo-random complex values.
 *
 * The generator is re-seeded from time(nullptr) on every call. Elements are
 * visited row by row; for each element the real part is drawn first, then
 * the imaginary part.
 *
 * @param      a     Destination array, indexed as a[row][column].
 * @param[in]  l     Lower bounds (real and imaginary parts are used separately).
 * @param[in]  h     Upper bounds (real and imaginary parts are used separately).
 * @param[in]  m     Number of rows.
 * @param[in]  n     Number of elements per row.
 */
 void clcg_vecrnd(lcg_complex **a, lcg_complex l, lcg_complex h, int m, int n)
 {
 	srand(time(nullptr));
 	for (int r = 0; r < m; ++r)
 	{
 		for (int c = 0; c < n; ++c)
 		{
 			lcg_complex &z = a[r][c];
 			z.rel = (h.rel-l.rel)*rand()*1.0/RAND_MAX + l.rel;
 			z.img = (h.img-l.img)*rand()*1.0/RAND_MAX + l.img;
 		}
 	}
 }
 /**
 * @brief      Unconjugated dot product, ret = \sum{a_i \cdot b_i}.
 *
 * ret is zeroed first and then used directly as the accumulator.
 *
 * @param[out] ret   Receives the result.
 * @param[in]  a     First vector.
 * @param[in]  b     Second vector.
 * @param[in]  size  Number of elements in both vectors.
 */
 void clcg_dot(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size)
 {
 	ret.rel = 0.0;
 	ret.img = 0.0;
 	for (int k = 0; k < size; ++k)
 	{
 		ret.rel += (a[k].rel*b[k].rel - a[k].img*b[k].img);
 		ret.img += (a[k].rel*b[k].img + a[k].img*b[k].rel);
 	}
 }
 /**
 * @brief      Inner product with the first vector conjugated, ret = \sum{\bar{a_i} \cdot b_i}.
 *
 * ret is zeroed first and then used directly as the accumulator.
 *
 * @param[out] ret   Receives the result.
 * @param[in]  a     First vector (its conjugate is used).
 * @param[in]  b     Second vector.
 * @param[in]  size  Number of elements in both vectors.
 */
 void clcg_inner(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size)
 {
 	ret.rel = 0.0;
 	ret.img = 0.0;
 	for (int k = 0; k < size; ++k)
 	{
 		ret.rel += (a[k].rel*b[k].rel + a[k].img*b[k].img);
 		ret.img += (a[k].rel*b[k].img - a[k].img*b[k].rel);
 	}
 }
 /**
 * @brief      Dense complex matrix-vector product (custom lcg_complex build).
 *
 * layout == MatNormal:   Ax[i] = \sum_j op(A[i][j]) * x[j]   for i in [0, m_size)
 * any other layout:      Ax[j] = \sum_i op(A[i][j]) * x[i]   for j in [0, n_size)
 * where op() is the complex conjugate when conjugate == Conjugate and the
 * identity otherwise.
 *
 * @param      A          Matrix given as m_size row pointers of n_size elements each.
 * @param[in]  x          Input vector (n_size elements for MatNormal, m_size otherwise).
 * @param      Ax         Output vector (m_size elements for MatNormal, n_size otherwise).
 * @param[in]  m_size     Number of rows of A.
 * @param[in]  n_size     Number of columns of A.
 * @param[in]  layout     MatNormal multiplies by A; any other value multiplies by its transpose.
 * @param[in]  conjugate  Conjugate uses the complex conjugate of the entries of A.
 */
 void clcg_matvec(lcg_complex **A, const lcg_complex *x, lcg_complex *Ax, 
 	int m_size, int n_size, lcg_matrix_e layout, clcg_complex_e conjugate)
 {
 	const bool use_conj = (conjugate == Conjugate);

 	if (layout == MatNormal)
 	{
 		if (use_conj)
 		{
 			// Ax = \bar{A} x
#pragma omp parallel for schedule(guided)
 			for (int row = 0; row < m_size; row++)
 			{
 				lcg_float sum_re = 0.0, sum_im = 0.0;
 				for (int col = 0; col < n_size; col++)
 				{
 					sum_re += (A[row][col].rel*x[col].rel + A[row][col].img*x[col].img);
 					sum_im += (A[row][col].rel*x[col].img - A[row][col].img*x[col].rel);
 				}
 				clcg_set(&Ax[row], sum_re, sum_im);
 			}
 		}
 		else
 		{
 			// Ax = A x
#pragma omp parallel for schedule(guided)
 			for (int row = 0; row < m_size; row++)
 			{
 				lcg_float sum_re = 0.0, sum_im = 0.0;
 				for (int col = 0; col < n_size; col++)
 				{
 					sum_re += (A[row][col].rel*x[col].rel - A[row][col].img*x[col].img);
 					sum_im += (A[row][col].rel*x[col].img + A[row][col].img*x[col].rel);
 				}
 				clcg_set(&Ax[row], sum_re, sum_im);
 			}
 		}
 		return;
 	}

 	// Every other layout value is handled as the transposed product.
 	if (use_conj)
 	{
 		// Ax = A^H x
#pragma omp parallel for schedule(guided)
 		for (int col = 0; col < n_size; col++)
 		{
 			lcg_float sum_re = 0.0, sum_im = 0.0;
 			for (int row = 0; row < m_size; row++)
 			{
 				sum_re += (A[row][col].rel*x[row].rel + A[row][col].img*x[row].img);
 				sum_im += (A[row][col].rel*x[row].img - A[row][col].img*x[row].rel);
 			}
 			clcg_set(&Ax[col], sum_re, sum_im);
 		}
 	}
 	else
 	{
 		// Ax = A^T x
#pragma omp parallel for schedule(guided)
 		for (int col = 0; col < n_size; col++)
 		{
 			lcg_float sum_re = 0.0, sum_im = 0.0;
 			for (int row = 0; row < m_size; row++)
 			{
 				sum_re += (A[row][col].rel*x[row].rel - A[row][col].img*x[row].img);
 				sum_im += (A[row][col].rel*x[row].img + A[row][col].img*x[row].rel);
 			}
 			clcg_set(&Ax[col], sum_re, sum_im);
 		}
 	}
 }
 #endif // LibLCG_STD_COMPLEX
--- a/src/lib/lcg_complex.h
+++ b/src/lib/lcg_complex.h
@@ -0,0 +1,329 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _LCG_COMPLEX_H
 #define _LCG_COMPLEX_H
 #include "iostream"
 #include "algebra.h"
 #ifdef LibLCG_STD_COMPLEX
 #include "complex"
 typedef std::complex<lcg_float> lcg_complex;
 #else
 /**
 * @brief     A simple definition of the complex number type, used when
 * LibLCG_STD_COMPLEX is not defined.
 * Easy to change in the future. Right now it is just two lcg_float variables.
 *
 * NOTE(review): the destructor is declared virtual, so the type is polymorphic
 * and is not a plain pair of lcg_float values in memory. The real()/imag()
 * getters are not const-qualified, so they cannot be called on a const object.
 */
 struct lcg_complex
 {
 	lcg_float rel; ///< The real part
 	lcg_float img; ///< The imaginary part
 	/**
 	 * @brief      Constructs a new instance with both parts set to zero.
 	 */
 	lcg_complex();
 	/**
 	 * @brief      Constructs a new instance from its two parts.
 	 *
 	 * @param[in]  r     The real part of the complex number
 	 * @param[in]  i     The imaginary part of the complex number
 	 */
 	lcg_complex(lcg_float r, lcg_float i);
 	/**
 	 * @brief      Destructor
 	 */
 	virtual ~lcg_complex();
 	/**
 	 * @brief      Set the real part of the complex number
 	 * 
 	 * @param[in]  a     New real part
 	 */
 	void real(lcg_float a);
 	/**
 	 * @brief      Set the imaginary part of the complex number
 	 * 
 	 * @param[in]  a     New imaginary part
 	 */
 	void imag(lcg_float a);
 	/**
 	 * @brief      Get the real part of the complex number
 	 * 
 	 * @return     The real part
 	 */
 	lcg_float real();
 	/**
 	 * @brief      Get the imaginary part of the complex number
 	 * 
 	 * @return     The imaginary part
 	 */
 	lcg_float imag();
 };
 /**
 * @brief      Overload of the equality operator.
 *
 * @param[in]  a     complex number a
 * @param[in]  b     complex number b
 *
 * @return     equal or not
 */
 bool operator==(const lcg_complex &a, const lcg_complex &b);
 /**
 * @brief      Overload of the inequality operator.
 *
 * @param[in]  a     complex number a
 * @param[in]  b     complex number b
 *
 * @return     unequal or not
 */
 bool operator!=(const lcg_complex &a, const lcg_complex &b);
 /**
 * @brief      Overload of the addition operator.
 *
 * @param[in]  a     complex number a
 * @param[in]  b     complex number b
 *
 * @return     sum
 */
 lcg_complex operator+(const lcg_complex &a, const lcg_complex &b);
 /**
 * @brief      Overload of the subtraction operator.
 *
 * @param[in]  a     complex number a
 * @param[in]  b     complex number b
 *
 * @return     subtraction
 */
 lcg_complex operator-(const lcg_complex &a, const lcg_complex &b);
 /**
 * @brief      Overload of the multiplication operator (complex times complex).
 *
 * @param[in]  a     complex number a
 * @param[in]  b     complex number b
 *
 * @return     product
 */
 lcg_complex operator*(const lcg_complex &a, const lcg_complex &b);
 /**
 * @brief      Overload of the multiplication operator (real times complex).
 *
 * @param[in]  a     real number a
 * @param[in]  b     complex number b
 *
 * @return     product
 */
 lcg_complex operator*(const lcg_float &a, const lcg_complex &b);
 /**
 * @brief      Overload of the division operator (complex divided by complex).
 *
 * @param[in]  a     complex number a
 * @param[in]  b     complex number b
 *
 * @return     quotient
 */
 lcg_complex operator/(const lcg_complex &a, const lcg_complex &b);
 /**
 * @brief      Overload of the division operator (real divided by complex).
 *
 * @param[in]  a     real number a
 * @param[in]  b     complex number b
 *
 * @return     quotient
 */
 lcg_complex operator/(const lcg_float &a, const lcg_complex &b);
 /**
 * @brief      Overload of the ostream insertion operator.
 *
 * @param      os    The ostream
 * @param[in]  a     complex number a
 *
 * @return     The ostream
 */
 std::ostream &operator<<(std::ostream &os, const lcg_complex &a);
 #endif // LibLCG_STD_COMPLEX
 /**
 * @brief      Locate memory for a lcg_complex pointer type.
 *
 * @param[in]  n     Number of lcg_complex elements in the array.
 *
 * @return     Pointer of the array's location.
 */
 lcg_complex* clcg_malloc(int n);
 /**
 * @brief      Allocate memory for a 2D lcg_complex array (pointer to row pointers).
 *
 * @param[in]  m     Number of rows.
 * @param[in]  n     Number of lcg_complex elements in each row.
 *
 * @return     Pointer to the array of row pointers.
 */
 lcg_complex** clcg_malloc(int m, int n);
 /**
 * @brief      Destroy memory used by the lcg_complex type array.
 *
 * @param      x     Pointer of the array.
 */
 void clcg_free(lcg_complex* x);
 /**
 * @brief      Destroy memory used by the 2D lcg_complex type array.
 *
 * @param      x     Pointer of the array.
 * @param[in]  m     Number of rows of the array.
 */
 void clcg_free(lcg_complex **x, int m);
 /**
 * @brief      set a complex vector's value
 *
 * @param      a     pointer of the vector
 * @param[in]  b     initial value
 * @param[in]  size  vector size
 */
 void clcg_vecset(lcg_complex *a, lcg_complex b, int size);
 /**
 * @brief      set a 2d complex vector's value
 *
 * @param      a     pointer of the matrix
 * @param[in]  b     initial value
 * @param[in]  m     row size of the matrix
 * @param[in]  n     column size of the matrix
 */
 void clcg_vecset(lcg_complex **a, lcg_complex b, int m, int n);
 /**
 * @brief      setup a complex number
 *
 * @param      a     The complex number to set
 * @param[in]  r     The real part of the complex number
 * @param[in]  i     The imaginary part of the complex number
 */
 void clcg_set(lcg_complex *a, lcg_float r, lcg_float i);
 /**
 * @brief      Calculate the squared module of a complex number
 *
 * @param[in]  a     The complex number
 *
 * @return     The squared module
 */
 lcg_float clcg_square(const lcg_complex *a);
 /**
 * @brief      Calculate the module of a complex number
 *
 * @return     The module
 */
 lcg_float clcg_module(const lcg_complex *a);
 /**
 * @brief      Calculate the conjugate of a complex number
 *
 * @return     The complex conjugate.
 */
 lcg_complex clcg_conjugate(const lcg_complex *a);
 /**
 * @brief      set a complex vector using random values
 *
 * @param      a     pointer of the vector
 * @param[in]  l     the lower bound of random values
 * @param[in]  h     the higher bound of random values
 * @param[in]  size  size of the vector
 */
 void clcg_vecrnd(lcg_complex *a, lcg_complex l, lcg_complex h, int size);
 /**
 * @brief      set a 2D complex vector using random values
 *
 * @param      a     pointer of the vector
 * @param[in]  l     the lower bound of random values
 * @param[in]  h     the higher bound of random values
 * @param[in]  m     row size of the vector
 * @param[in]  n     column size of the vector
 */
 void clcg_vecrnd(lcg_complex **a, lcg_complex l, lcg_complex h, int m, int n);
 /**
 * @brief      calculate dot product of two complex vectors
 * 
 * the product of two complex vectors is defined as <a, b> = \sum{a_i \cdot b_i}
 *
 * @param[out] ret     the calculated product
 * @param[in]  a       complex vector a
 * @param[in]  b       complex vector b
 * @param[in]  size    size of the vectors
 */
 void clcg_dot(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size);
 /**
 * @brief      calculate inner product of two complex vectors
 * 
 * the product of two complex vectors is defined as <a, b> = \sum{\bar{a_i} \cdot b_i}
 *
 * @param[out] ret     the calculated product
 * @param[in]  a       complex vector a
 * @param[in]  b       complex vector b
 * @param[in]  size    size of the vectors
 */
 void clcg_inner(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size);
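 /**
 * @brief      calculate product of a complex matrix and a complex vector
 * 
 * each output element is the sum of the products of one row (or, for the
 * transposed layout, one column) of A with the entries of x; the entries of A
 * are conjugated first when conjugate=Conjugate.
 * Different configurations:
 * layout=MatNormal,conjugate=NonConjugate -> A
 * layout=transposed,conjugate=NonConjugate -> A^T
 * layout=MatNormal,conjugate=Conjugate -> \bar{A}
 * layout=transposed,conjugate=Conjugate -> A^H
 *
 * @param      A          complex matrix A
 * @param[in]  x          complex vector x (n_size elements for MatNormal, m_size elements for the transposed layout)
 * @param      Ax         product of Ax (m_size elements for MatNormal, n_size elements for the transposed layout)
 * @param[in]  m_size     row size of A
 * @param[in]  n_size     column size of A
 * @param[in]  layout     layout of A used for multiplication. MatNormal uses A; any other value is treated as the transposed layout
 * @param[in]  conjugate  whether to use the complex conjugate of A for calculation (Conjugate or NonConjugate)
 */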
 void clcg_matvec(lcg_complex **A, const lcg_complex *x, lcg_complex *Ax, int m_size, int n_size, 
 	lcg_matrix_e layout = MatNormal, clcg_complex_e conjugate = NonConjugate);
 #endif // _LCG_COMPLEX_H
--- a/src/lib/lcg_complex_cuda.cu
+++ b/src/lib/lcg_complex_cuda.cu
@@ -0,0 +1,356 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "lcg_complex_cuda.h"
 #include "complex"
 #include "map"
 /**
 * @brief      Kernel: copy the diagonal entry of every row of a CSR matrix (single precision).
 *
 * Uses a 1D launch with one thread per row; threads with an index >= A_len do
 * nothing. A_diag[i] is written only when row i stores an entry whose column
 * index equals i; otherwise it is left untouched.
 *
 * @param[in]  A_row   CSR row offsets (entries i and i + 1 are read for row i).
 * @param[in]  A_col   CSR column indices.
 * @param[in]  A_val   CSR values.
 * @param[in]  A_len   Number of rows to process.
 * @param      A_diag  Output diagonal, one element per row.
 */
 __global__ void smCcsr_get_diagonal_device(const int *A_row, const int *A_col, const cuComplex *A_val, const int A_len, cuComplex *A_diag)
 {
 	const int row = blockIdx.x * blockDim.x + threadIdx.x;
 	if (row >= A_len)
 	{
 		return;
 	}

 	const int row_begin = A_row[row];
 	const int row_end = A_row[row + 1];
 	for (int k = row_begin; k < row_end; k++)
 	{
 		if (A_col[k] == row)
 		{
 			// First matching entry wins; stop scanning the row.
 			A_diag[row] = A_val[k];
 			return;
 		}
 	}
 }
 /**
 * @brief      Kernel: copy the diagonal entry of every row of a CSR matrix (double precision).
 *
 * Uses a 1D launch with one thread per row; threads with an index >= A_len do
 * nothing. A_diag[i] is written only when row i stores an entry whose column
 * index equals i; otherwise it is left untouched.
 *
 * @param[in]  A_row   CSR row offsets (entries i and i + 1 are read for row i).
 * @param[in]  A_col   CSR column indices.
 * @param[in]  A_val   CSR values.
 * @param[in]  A_len   Number of rows to process.
 * @param      A_diag  Output diagonal, one element per row.
 */
 __global__ void smZcsr_get_diagonal_device(const int *A_row, const int *A_col, const cuDoubleComplex *A_val, const int A_len, cuDoubleComplex *A_diag)
 {
 	const int row = blockIdx.x * blockDim.x + threadIdx.x;
 	if (row >= A_len)
 	{
 		return;
 	}

 	const int row_begin = A_row[row];
 	const int row_end = A_row[row + 1];
 	for (int k = row_begin; k < row_end; k++)
 	{
 		if (A_col[k] == row)
 		{
 			// First matching entry wins; stop scanning the row.
 			A_diag[row] = A_val[k];
 			return;
 		}
 	}
 }
 /**
 * @brief      Kernel: element-wise product c[i] = a[i] * b[i] (single precision).
 *
 * Uses a 1D launch with one thread per element; threads with an index >= n do nothing.
 *
 * @param[in]  a     First factor.
 * @param[in]  b     Second factor.
 * @param      c     Output vector.
 * @param[in]  n     Number of elements.
 */
 __global__ void vecMvecC_element_wise_device(const cuComplex *a, const cuComplex *b, cuComplex *c, int n)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= n)
 	{
 		return;
 	}
 	c[idx] = cuCmulf(a[idx], b[idx]);
 }
 /**
 * @brief      Kernel: element-wise product c[i] = a[i] * b[i] (double precision).
 *
 * Uses a 1D launch with one thread per element; threads with an index >= n do nothing.
 *
 * @param[in]  a     First factor.
 * @param[in]  b     Second factor.
 * @param      c     Output vector.
 * @param[in]  n     Number of elements.
 */
 __global__ void vecMvecZ_element_wise_device(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= n)
 	{
 		return;
 	}
 	c[idx] = cuCmul(a[idx], b[idx]);
 }
 /**
 * @brief      Kernel: element-wise quotient c[i] = a[i] / b[i] (single precision).
 *
 * Uses a 1D launch with one thread per element; threads with an index >= n do nothing.
 *
 * @param[in]  a     Numerators.
 * @param[in]  b     Denominators.
 * @param      c     Output vector.
 * @param[in]  n     Number of elements.
 */
 __global__ void vecDvecC_element_wise_device(const cuComplex *a, const cuComplex *b, cuComplex *c, int n)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= n)
 	{
 		return;
 	}
 	c[idx] = cuCdivf(a[idx], b[idx]);
 }
 /**
 * @brief      Kernel: element-wise quotient c[i] = a[i] / b[i] (double precision).
 *
 * Uses a 1D launch with one thread per element; threads with an index >= n do nothing.
 *
 * @param[in]  a     Numerators.
 * @param[in]  b     Denominators.
 * @param      c     Output vector.
 * @param[in]  n     Number of elements.
 */
 __global__ void vecDvecZ_element_wise_device(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= n)
 	{
 		return;
 	}
 	c[idx] = cuCdiv(a[idx], b[idx]);
 }
 /**
 * @brief      Kernel: element-wise complex conjugate ca[i] = conj(a[i]) (single precision).
 *
 * Uses a 1D launch with one thread per element; threads with an index >= n do
 * nothing. Each output component is written exactly once and the sign flip is
 * done in single precision (no double-precision literal is involved).
 *
 * @param[in]  a     Input vector.
 * @param      ca    Output vector.
 * @param[in]  n     Number of elements.
 */
 __global__ void vecC_conjugate_device(const cuComplex *a, cuComplex *ca, int n)
 {
 	int i = blockIdx.x * blockDim.x + threadIdx.x;
 	if (i < n)
 	{
 		ca[i].x = a[i].x;
 		ca[i].y = -a[i].y;
 	}
 	return;
 }
 /**
 * @brief      Kernel: element-wise complex conjugate ca[i] = conj(a[i]) (double precision).
 *
 * Uses a 1D launch with one thread per element; threads with an index >= n do nothing.
 *
 * @param[in]  a     Input vector.
 * @param      ca    Output vector.
 * @param[in]  n     Number of elements.
 */
 __global__ void vecZ_conjugate_device(const cuDoubleComplex *a, cuDoubleComplex *ca, int n)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= n)
 	{
 		return;
 	}

 	cuDoubleComplex z = a[idx];
 	z.y *= -1.0;
 	ca[idx] = z;
 }
 /**
 * @brief      Convert a CUDA double complex number to an lcg_complex.
 *
 * @param[in]  a     CUDA complex number (x is the real part, y the imaginary part).
 *
 * @return     The same value as an lcg_complex.
 */
 lcg_complex cuda2lcg_complex(cuDoubleComplex a)
 {
 	lcg_complex out(a.x, a.y);
 	return out;
 }
 #ifdef LibLCG_STD_COMPLEX
 /**
 * @brief      Convert an lcg_complex (std::complex build) to a CUDA double complex number.
 *
 * @param[in]  a     lcg complex number.
 *
 * @return     The same value with x holding the real part and y the imaginary part.
 */
 cuDoubleComplex lcg2cuda_complex(lcg_complex a)
 {
 	cuDoubleComplex out;
 	out.x = a.real();
 	out.y = a.imag();
 	return out;
 }
 #else
 /**
 * @brief      Convert an lcg_complex (custom struct build) to a CUDA double complex number.
 *
 * rel and img are data members of the custom lcg_complex struct, so they
 * are read directly rather than called as functions.
 *
 * @param[in]  a     lcg complex number.
 *
 * @return     The same value with x holding the real part and y the imaginary part.
 */
 cuDoubleComplex lcg2cuda_complex(lcg_complex a)
 {
 	cuDoubleComplex o;
 	o.x = a.rel; o.y = a.img;
 	return o;
 }
 #endif // LibLCG_STD_COMPLEX
 /**
 * @brief      Allocate an array of n cuDoubleComplex values with new[].
 *
 * @param[in]  n     Number of elements to allocate.
 *
 * @return     Pointer to the first element of the new array.
 */
 cuDoubleComplex* clcg_malloc_cuda(size_t n)
 {
 	return new cuDoubleComplex [n];
 }
 /**
 * @brief      Release an array with delete[]. A null pointer is accepted and ignored.
 *
 * The pointer is passed by value, so the caller's copy is not reset by this function.
 *
 * @param      x     Pointer of the array (may be nullptr).
 */
 void clcg_free_cuda(cuDoubleComplex *x)
 {
 	// delete[] on a null pointer is a no-op, so no explicit check is needed.
 	delete[] x;
 }
 /**
 * @brief      Assign the value b to the first size elements of a, using a host-side loop.
 *
 * @param      a     Destination vector.
 * @param[in]  b     Value copied into every element.
 * @param[in]  size  Number of elements to set.
 */
 void clcg_vecset_cuda(cuDoubleComplex *a, cuDoubleComplex b, size_t size)
 {
 	size_t k = 0;
 	while (k < size)
 	{
 		a[k].x = b.x;
 		a[k].y = b.y;
 		++k;
 	}
 }
 /**
 * @brief      Scale a single-precision complex number by a real factor.
 *
 * @param[in]  s     Real factor.
 * @param[in]  a     Complex number.
 *
 * @return     s * a.
 */
 cuComplex clcg_Cscale(float s, cuComplex a)
 {
 	cuComplex scaled;
 	scaled.x = s*a.x;
 	scaled.y = s*a.y;
 	return scaled;
 }
 /**
 * @brief      Component-wise sum of two single-precision complex numbers.
 *
 * @param[in]  a     First operand.
 * @param[in]  b     Second operand.
 *
 * @return     a + b.
 */
 cuComplex clcg_Csum(cuComplex a, cuComplex b)
 {
 	cuComplex total;
 	total.x = a.x + b.x;
 	total.y = a.y + b.y;
 	return total;
 }
 /**
 * @brief      Component-wise difference of two single-precision complex numbers.
 *
 * @param[in]  a     Minuend.
 * @param[in]  b     Subtrahend.
 *
 * @return     a - b.
 */
 cuComplex clcg_Cdiff(cuComplex a, cuComplex b)
 {
 	cuComplex delta;
 	delta.x = a.x - b.x;
 	delta.y = a.y - b.y;
 	return delta;
 }
 /**
 * @brief      Square root of a single-precision complex number, computed with std::sqrt
 *             on std::complex<float>.
 *
 * @param[in]  a     Complex number.
 *
 * @return     The square root returned by std::sqrt.
 */
 cuComplex clcg_Csqrt(cuComplex a)
 {
 	const std::complex<float> input(a.x, a.y);
 	const std::complex<float> root = std::sqrt(input);

 	cuComplex out;
 	out.x = root.real();
 	out.y = root.imag();
 	return out;
 }
 /**
 * @brief      Scale a double-precision complex number by a real factor.
 *
 * @param[in]  s     Real factor.
 * @param[in]  a     Complex number.
 *
 * @return     s * a.
 */
 cuDoubleComplex clcg_Zscale(lcg_float s, cuDoubleComplex a)
 {
 	cuDoubleComplex scaled;
 	scaled.x = s*a.x;
 	scaled.y = s*a.y;
 	return scaled;
 }
 /**
 * @brief      Component-wise sum of two double-precision complex numbers.
 *
 * @param[in]  a     First operand.
 * @param[in]  b     Second operand.
 *
 * @return     a + b.
 */
 cuDoubleComplex clcg_Zsum(cuDoubleComplex a, cuDoubleComplex b)
 {
 	cuDoubleComplex total;
 	total.x = a.x + b.x;
 	total.y = a.y + b.y;
 	return total;
 }
 /**
 * @brief      Component-wise difference of two double-precision complex numbers.
 *
 * @param[in]  a     Minuend.
 * @param[in]  b     Subtrahend.
 *
 * @return     a - b.
 */
 cuDoubleComplex clcg_Zdiff(cuDoubleComplex a, cuDoubleComplex b)
 {
 	cuDoubleComplex delta;
 	delta.x = a.x - b.x;
 	delta.y = a.y - b.y;
 	return delta;
 }
 /**
 * @brief      Square root of a double-precision complex number, computed with std::sqrt
 *             on std::complex<lcg_float>.
 *
 * @param[in]  a     Complex number.
 *
 * @return     The square root returned by std::sqrt.
 */
 cuDoubleComplex clcg_Zsqrt(cuDoubleComplex a)
 {
 	const std::complex<lcg_float> input(a.x, a.y);
 	const std::complex<lcg_float> root = std::sqrt(input);

 	cuDoubleComplex out;
 	out.x = root.real();
 	out.y = root.imag();
 	return out;
 }
 void clcg_smCcoo_row2col(const int *A_row, const int *A_col, const cuComplex *A, int N, int nz, int *Ac_row, int *Ac_col, cuComplex *Ac_val)
 {
 	size_t i, order;
 	std::map<size_t, cuComplex> sort_map;
 	std::map<size_t, cuComplex>::iterator st_iter;
 	for (i = 0; i < nz; i++)
 	{
 		order = N*A_col[i] + A_row[i];
 		sort_map[order] = A[i];
 	}
 	i = 0;
 	for (st_iter = sort_map.begin(); st_iter != sort_map.end(); st_iter++)
 	{
 		order = st_iter->first;
 		// exchange the row and column indice to rotate the matrix
 		Ac_row[i] = order/N;
 		Ac_col[i] = order%N;
 		Ac_val[i] = st_iter->second;
 		i++;
 	}
 	sort_map.clear();
 	return;
 }
 void clcg_smZcoo_row2col(const int *A_row, const int *A_col, const cuDoubleComplex *A, int N, int nz, int *Ac_row, int *Ac_col, cuDoubleComplex *Ac_val)
 {
 	size_t i, order;
 	std::map<size_t, cuDoubleComplex> sort_map;
 	std::map<size_t, cuDoubleComplex>::iterator st_iter;
 	for (i = 0; i < nz; i++)
 	{
 		order = N*A_col[i] + A_row[i];
 		sort_map[order] = A[i];
 	}
 	i = 0;
 	for (st_iter = sort_map.begin(); st_iter != sort_map.end(); st_iter++)
 	{
 		order = st_iter->first;
 		// exchange the row and column indice to rotate the matrix
 		Ac_row[i] = order/N;
 		Ac_col[i] = order%N;
 		Ac_val[i] = st_iter->second;
 		i++;
 	}
 	sort_map.clear();
 	return;
 }
 /**
 * @brief      Launch smCcsr_get_diagonal_device with one thread per matrix row.
 *
 * The grid holds ceil(A_len / bk_size) blocks of bk_size threads. No stream
 * argument is passed to the launch and no error check is performed here.
 * Nothing is launched when A_len is not positive.
 *
 * @param[in]  A_ptr    CSR row offsets.
 * @param[in]  A_col    CSR column indices.
 * @param[in]  A_val    CSR values.
 * @param[in]  A_len    Number of rows.
 * @param      A_diag   Output diagonal, one element per row.
 * @param[in]  bk_size  Threads per block.
 */
 void clcg_smCcsr_get_diagonal(const int *A_ptr, const int *A_col, const cuComplex *A_val, const int A_len, cuComplex *A_diag, int bk_size)
 {
 	// An empty problem would give a zero-block grid, which is an invalid launch configuration.
 	if (A_len <= 0) return;
 	int blockSize = bk_size;
 	int numBlocks = (A_len + blockSize - 1) / blockSize;
 	smCcsr_get_diagonal_device<<<numBlocks, blockSize>>>(A_ptr, A_col, A_val, A_len, A_diag);
 	return;
 }
 /**
 * @brief      Launch smZcsr_get_diagonal_device with one thread per matrix row.
 *
 * The grid holds ceil(A_len / bk_size) blocks of bk_size threads. No stream
 * argument is passed to the launch and no error check is performed here.
 * Nothing is launched when A_len is not positive.
 *
 * @param[in]  A_ptr    CSR row offsets.
 * @param[in]  A_col    CSR column indices.
 * @param[in]  A_val    CSR values.
 * @param[in]  A_len    Number of rows.
 * @param      A_diag   Output diagonal, one element per row.
 * @param[in]  bk_size  Threads per block.
 */
 void clcg_smZcsr_get_diagonal(const int *A_ptr, const int *A_col, const cuDoubleComplex *A_val, const int A_len, cuDoubleComplex *A_diag, int bk_size)
 {
 	// An empty problem would give a zero-block grid, which is an invalid launch configuration.
 	if (A_len <= 0) return;
 	int blockSize = bk_size;
 	int numBlocks = (A_len + blockSize - 1) / blockSize;
 	smZcsr_get_diagonal_device<<<numBlocks, blockSize>>>(A_ptr, A_col, A_val, A_len, A_diag);
 	return;
 }
 /**
 * @brief      Launch vecMvecC_element_wise_device to compute c[i] = a[i] * b[i].
 *
 * The grid holds ceil(n / bk_size) blocks of bk_size threads. No stream
 * argument is passed to the launch and no error check is performed here.
 * Nothing is launched when n is not positive.
 *
 * @param[in]  a        First factor.
 * @param[in]  b        Second factor.
 * @param      c        Output vector.
 * @param[in]  n        Number of elements.
 * @param[in]  bk_size  Threads per block.
 */
 void clcg_vecMvecC_element_wise(const cuComplex *a, const cuComplex *b, cuComplex *c, int n, int bk_size)
 {
 	// An empty problem would give a zero-block grid, which is an invalid launch configuration.
 	if (n <= 0) return;
 	int blockSize = bk_size;
 	int numBlocks = (n + blockSize - 1) / blockSize;
 	vecMvecC_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
 	return;
 }
 /**
 * @brief      Launch vecMvecZ_element_wise_device to compute c[i] = a[i] * b[i].
 *
 * The grid holds ceil(n / bk_size) blocks of bk_size threads. No stream
 * argument is passed to the launch and no error check is performed here.
 * Nothing is launched when n is not positive.
 *
 * @param[in]  a        First factor.
 * @param[in]  b        Second factor.
 * @param      c        Output vector.
 * @param[in]  n        Number of elements.
 * @param[in]  bk_size  Threads per block.
 */
 void clcg_vecMvecZ_element_wise(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n, int bk_size)
 {
 	// An empty problem would give a zero-block grid, which is an invalid launch configuration.
 	if (n <= 0) return;
 	int blockSize = bk_size;
 	int numBlocks = (n + blockSize - 1) / blockSize;
 	vecMvecZ_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
 	return;
 }
 /**
 * @brief      Launch vecDvecC_element_wise_device to compute c[i] = a[i] / b[i].
 *
 * The grid holds ceil(n / bk_size) blocks of bk_size threads. No stream
 * argument is passed to the launch and no error check is performed here.
 * Nothing is launched when n is not positive.
 *
 * @param[in]  a        Numerators.
 * @param[in]  b        Denominators.
 * @param      c        Output vector.
 * @param[in]  n        Number of elements.
 * @param[in]  bk_size  Threads per block.
 */
 void clcg_vecDvecC_element_wise(const cuComplex *a, const cuComplex *b, cuComplex *c, int n, int bk_size)
 {
 	// An empty problem would give a zero-block grid, which is an invalid launch configuration.
 	if (n <= 0) return;
 	int blockSize = bk_size;
 	int numBlocks = (n + blockSize - 1) / blockSize;
 	vecDvecC_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
 	return;
 }
 /**
 * @brief      Launch vecDvecZ_element_wise_device to compute c[i] = a[i] / b[i].
 *
 * The grid holds ceil(n / bk_size) blocks of bk_size threads. No stream
 * argument is passed to the launch and no error check is performed here.
 * Nothing is launched when n is not positive.
 *
 * @param[in]  a        Numerators.
 * @param[in]  b        Denominators.
 * @param      c        Output vector.
 * @param[in]  n        Number of elements.
 * @param[in]  bk_size  Threads per block.
 */
 void clcg_vecDvecZ_element_wise(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n, int bk_size)
 {
 	// An empty problem would give a zero-block grid, which is an invalid launch configuration.
 	if (n <= 0) return;
 	int blockSize = bk_size;
 	int numBlocks = (n + blockSize - 1) / blockSize;
 	vecDvecZ_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
 	return;
 }
 /**
 * @brief      Launch vecC_conjugate_device to compute ca[i] = conj(a[i]).
 *
 * The grid holds ceil(n / bk_size) blocks of bk_size threads. No stream
 * argument is passed to the launch and no error check is performed here.
 * Nothing is launched when n is not positive.
 *
 * @param[in]  a        Input vector.
 * @param      ca       Output vector.
 * @param[in]  n        Number of elements.
 * @param[in]  bk_size  Threads per block.
 */
 void clcg_vecC_conjugate(const cuComplex *a, cuComplex *ca, int n, int bk_size)
 {
 	// An empty problem would give a zero-block grid, which is an invalid launch configuration.
 	if (n <= 0) return;
 	int blockSize = bk_size;
 	int numBlocks = (n + blockSize - 1) / blockSize;
 	vecC_conjugate_device<<<numBlocks, blockSize>>>(a, ca, n);
 	return;
 }
 /**
 * @brief      Launch vecZ_conjugate_device to compute ca[i] = conj(a[i]).
 *
 * The grid holds ceil(n / bk_size) blocks of bk_size threads. No stream
 * argument is passed to the launch and no error check is performed here.
 * Nothing is launched when n is not positive.
 *
 * @param[in]  a        Input vector.
 * @param      ca       Output vector.
 * @param[in]  n        Number of elements.
 * @param[in]  bk_size  Threads per block.
 */
 void clcg_vecZ_conjugate(const cuDoubleComplex *a, cuDoubleComplex *ca, int n, int bk_size)
 {
 	// An empty problem would give a zero-block grid, which is an invalid launch configuration.
 	if (n <= 0) return;
 	int blockSize = bk_size;
 	int numBlocks = (n + blockSize - 1) / blockSize;
 	vecZ_conjugate_device<<<numBlocks, blockSize>>>(a, ca, n);
 	return;
 }
--- a/src/lib/lcg_complex_cuda.h
+++ b/src/lib/lcg_complex_cuda.h
@@ -0,0 +1,278 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _LCG_COMPLEX_CUDA_H
 #define _LCG_COMPLEX_CUDA_H
 #include "lcg_complex.h"
 #ifdef LibLCG_CUDA
 #include <cuda_runtime.h>
 #include <cuComplex.h>
 /**
 * @brief  Convert cuda complex number to lcg complex number
 * 
 * @param a CUDA complex number
 * @return lcg_complex  lcg complex number
 */
 lcg_complex cuda2lcg_complex(cuDoubleComplex a);
 /**
 * @brief Convert lcg complex number to CUDA complex number
 * 
 * @param a lcg complex number
 * @return cuDoubleComplex CUDA complex number
 */
 cuDoubleComplex lcg2cuda_complex(lcg_complex a);
 /**
 * @brief      Allocate memory for a cuDoubleComplex array.
 *
 * @param[in]  n     Size of the cuDoubleComplex array.
 *
 * @return     Pointer of the array's location.
 */
 cuDoubleComplex* clcg_malloc_cuda(size_t n);
 /**
 * @brief      Destroy memory used by the cuDoubleComplex type array.
 *
 * @param      x     Pointer of the array.
 */
 void clcg_free_cuda(cuDoubleComplex *x);
 /**
 * @brief      set a complex vector's value
 *
 * @param      a     pointer of the vector
 * @param[in]  b     initial value
 * @param[in]  size  vector size
 */
 void clcg_vecset_cuda(cuDoubleComplex *a, cuDoubleComplex b, size_t size);
 /**
 * @brief    Host side function for scaling a cuComplex object
 * 
 * @param s  scale factor
 * @param a  Complex number
 * @return cuComplex  scaled complex number
 */
 cuComplex clcg_Cscale(lcg_float s, cuComplex a);
 /**
 * @brief   Calculate the sum of two cuda complex number. This is a host side function.
 * 
 * @param a Complex number
 * @param b Complex number
 * @return cuComplex Sum of the input complex number 
 */
 cuComplex clcg_Csum(cuComplex a, cuComplex b);
 /**
 * @brief   Calculate the difference of two cuda complex number. This is a host side function.
 * 
 * @param a Complex number
 * @param b Complex number
 * @return cuComplex Difference of the input complex number 
 */
 cuComplex clcg_Cdiff(cuComplex a, cuComplex b);
 /**
 * @brief   Calculate the sqrt() of a cuda complex number
 * 
 * @param a Complex number
 * @return cuComplex root value
 */
 cuComplex clcg_Csqrt(cuComplex a);
 /**
 * @brief    Host side function for scaling a cuDoubleComplex object
 * 
 * @param s  scale factor
 * @param a  Complex number
 * @return cuDoubleComplex  scaled complex number
 */
 cuDoubleComplex clcg_Zscale(lcg_float s, cuDoubleComplex a);
 /**
 * @brief   Calculate the sum of two cuda complex number. This is a host side function.
 * 
 * @param a Complex number
 * @param b Complex number
 * @return cuDoubleComplex Sum of the input complex number 
 */
 cuDoubleComplex clcg_Zsum(cuDoubleComplex a, cuDoubleComplex b);
 /**
 * @brief   Calculate the difference of two cuda complex number. This is a host side function.
 * 
 * @param a Complex number
 * @param b Complex number
 * @return cuDoubleComplex Difference of the input complex number 
 */
 cuDoubleComplex clcg_Zdiff(cuDoubleComplex a, cuDoubleComplex b);
 /**
 * @brief   Calculate the sqrt() of a cuda complex number
 * 
 * @param a Complex number
 * @return cuDoubleComplex root value
 */
 cuDoubleComplex clcg_Zsqrt(cuDoubleComplex a);
 /**
 * @brief   Convert the indexing sequence of a sparse matrix from the row-major to col-major format.
 * 
 * @note    The sparse matrix is stored in the COO format. This is a host side function.
 * 
 * @param A_row      Row index
 * @param A_col      Column index
 * @param A          Non-zero values of the matrix
 * @param N          Row/column length of A
 * @param nz         Number of the non-zero values in A
 * @param Ac_row     Output row index
 * @param Ac_col     Output column index
 * @param Ac_val     Non-zero values of the output matrix
 */
 void clcg_smCcoo_row2col(const int *A_row, const int *A_col, const cuComplex *A, int N, int nz, int *Ac_row, int *Ac_col, cuComplex *Ac_val);
 /**
 * @brief   Convert the indexing sequence of a sparse matrix from the row-major to col-major format.
 * 
 * @note    The sparse matrix is stored in the COO format. This is a host side function.
 * 
 * @param A_row      Row index
 * @param A_col      Column index
 * @param A          Non-zero values of the matrix
 * @param N          Row/column length of A
 * @param nz         Number of the non-zero values in A
 * @param Ac_row     Output row index
 * @param Ac_col     Output column index
 * @param Ac_val     Non-zero values of the output matrix
 */
 void clcg_smZcoo_row2col(const int *A_row, const int *A_col, const cuDoubleComplex *A, int N, int nz, int *Ac_row, int *Ac_col, cuDoubleComplex *Ac_val);
 /**
 * @brief      Extract diagonal elements from a square CUDA sparse matrix that is formatted in the CSR format
 * 
 * @note       This is a device side function. All memories must be allocated on the GPU device.
 *
 * @param[in]  A_ptr   Row index pointer
 * @param[in]  A_col   Column index
 * @param[in]  A_val   Non-zero values of the matrix
 * @param[in]  A_len   Dimension of the matrix
 * @param      A_diag  Output diagonal elements
 * @param[in]  bk_size Default CUDA block size.
 */
 void clcg_smCcsr_get_diagonal(const int *A_ptr, const int *A_col, const cuComplex *A_val, const int A_len, cuComplex *A_diag, int bk_size = 1024);
 /**
 * @brief      Extract diagonal elements from a square CUDA sparse matrix that is formatted in the CSR format
 * 
 * @note       This is a device side function. All memories must be allocated on the GPU device.
 *
 * @param[in]  A_ptr   Row index pointer
 * @param[in]  A_col   Column index
 * @param[in]  A_val   Non-zero values of the matrix
 * @param[in]  A_len   Dimension of the matrix
 * @param      A_diag  Output diagonal elements
 * @param[in]  bk_size Default CUDA block size.
 */
 void clcg_smZcsr_get_diagonal(const int *A_ptr, const int *A_col, const cuDoubleComplex *A_val, const int A_len, cuDoubleComplex *A_diag, int bk_size = 1024);
 /**
 * @brief      Element-wise multiplication between two CUDA arrays.
 * 
 * @note       This is a device side function. All memories must be allocated on the GPU device.
 *
 * @param[in]  a     Pointer of the input array
 * @param[in]  b     Pointer of the input array
 * @param      c     Pointer of the output array
 * @param[in]  n     Length of the arrays
 * @param[in]  bk_size Default CUDA block size.
 */
 void clcg_vecMvecC_element_wise(const cuComplex *a, const cuComplex *b, cuComplex *c, int n, int bk_size = 1024);
 /**
 * @brief      Element-wise multiplication between two CUDA arrays.
 * 
 * @note       This is a device side function. All memories must be allocated on the GPU device.
 *
 * @param[in]  a     Pointer of the input array
 * @param[in]  b     Pointer of the input array
 * @param      c     Pointer of the output array
 * @param[in]  n     Length of the arrays
 * @param[in]  bk_size Default CUDA block size.
 */
 void clcg_vecMvecZ_element_wise(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n, int bk_size = 1024);
 /**
 * @brief      Element-wise division between two CUDA arrays.
 * 
 * @note       This is a device side function. All memories must be allocated on the GPU device.
 *
 * @param[in]  a     Pointer of the input array
 * @param[in]  b     Pointer of the input array
 * @param      c     Pointer of the output array
 * @param[in]  n     Length of the arrays
 * @param[in]  bk_size Default CUDA block size.
 */
 void clcg_vecDvecC_element_wise(const cuComplex *a, const cuComplex *b, cuComplex *c, int n, int bk_size = 1024);
 /**
 * @brief      Element-wise division between two CUDA arrays.
 * 
 * @note       This is a device side function. All memories must be allocated on the GPU device.
 *
 * @param[in]  a     Pointer of the input array
 * @param[in]  b     Pointer of the input array
 * @param      c     Pointer of the output array
 * @param[in]  n     Length of the arrays
 * @param[in]  bk_size Default CUDA block size.
 */
 void clcg_vecDvecZ_element_wise(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n, int bk_size = 1024);
 /**
 * @brief      Return complex conjugates of an input CUDA complex array
 * 
 * @param a    Pointer of the input array
 * @param ca   Pointer of the output array
 * @param n    Length of the arrays
 * @param[in]  bk_size Default CUDA block size.
 */
 void clcg_vecC_conjugate(const cuComplex *a, cuComplex *ca, int n, int bk_size = 1024);
 /**
 * @brief      Return complex conjugates of an input CUDA complex array
 * 
 * @param a    Pointer of the input array
 * @param ca   Pointer of the output array
 * @param n    Length of the arrays
 * @param[in]  bk_size Default CUDA block size.
 */
 void clcg_vecZ_conjugate(const cuDoubleComplex *a, cuDoubleComplex *ca, int n, int bk_size = 1024);
 #endif // LibLCG_CUDA
 #endif // _LCG_COMPLEX_CUDA_H
--- a/src/lib/lcg_cuda.cu
+++ b/src/lib/lcg_cuda.cu
@@ -0,0 +1,685 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "cmath"
 #include "ctime"
 #include "iostream"
 #include "lcg_cuda.h"
 // Common signature of the unconstrained CUDA solvers that lcg_solver_cuda() can dispatch to
 typedef int (*lcg_solver_cuda_ptr)(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
    const int n_size, const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 // Forward declaration of the conjugate gradient solver defined later in this file
 int lcg(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, 
    const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 // Forward declaration of the conjugate gradient squared solver defined later in this file
 int lcgs(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, 
    const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 /**
 * @brief      Select a solver according to solver_id and run it with the given arguments.
 *
 * @note       LCG_CGS selects lcgs(). LCG_CG and any other value select lcg().
 *
 * @return     Status code returned by the selected solver.
 */
 int lcg_solver_cuda(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, 
    const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id)
 {
 	// lcg is both the LCG_CG choice and the fallback for unrecognized ids
 	lcg_solver_cuda_ptr chosen_solver = lcg;
 	if (solver_id == LCG_CGS)
 	{
 		chosen_solver = lcgs;
 	}
 	return chosen_solver(Afp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
 }
 // Forward declaration of the preconditioned conjugate gradient solver defined later in this file
 int lpcg(lcg_axfunc_cuda_ptr Afp, lcg_axfunc_cuda_ptr Mfp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
 	const int n_size, const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 /**
 * @brief      Entry point of the preconditioned CUDA solvers. All arguments except solver_id
 * are forwarded to lpcg().
 *
 * @note       solver_id is not consulted; lpcg() is always used.
 *
 * @return     Status code returned by lpcg().
 */
 int lcg_solver_preconditioned_cuda(lcg_axfunc_cuda_ptr Afp, lcg_axfunc_cuda_ptr Mfp, lcg_progress_cuda_ptr Pfp, 
    lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, const lcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id)
 {
 	const int status = lpcg(Afp, Mfp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
 	return status;
 }
 // Forward declaration of the box-constrained solver defined later in this file
 int lpg(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
 	const lcg_float* low, const lcg_float* hig, const int n_size, const int nz_size, const lcg_para* param, 
 	void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 /**
 * @brief      Entry point of the constrained CUDA solvers. All arguments except solver_id
 * are forwarded to lpg().
 *
 * @note       solver_id is not consulted; lpg() is always used.
 *
 * @return     Status code returned by lpg().
 */
 int lcg_solver_constrained_cuda(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
    const lcg_float* low, const lcg_float* hig, const int n_size, const int nz_size, const lcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id)
 {
 	const int status = lpg(Afp, Pfp, m, B, low, hig, n_size, nz_size, param, instance, cub_handle, cus_handle);
 	return status;
 }
 /**
 * @brief      Conjugate gradient solver of A*m = B that iterates on the GPU.
 *
 * @note       m and B are host arrays: they are copied to device buffers on entry and the
 * solution is copied back into m on exit. The progress callback receives the device copy of m.
 * The double-precision cuBLAS routines and CUDA_R_64F descriptors are used, so lcg_float is
 * presumably double - TODO confirm.
 * NOTE(review): return codes of the CUDA/cuBLAS/cuSPARSE calls are not checked.
 *
 * @param[in]  Afp         Callback computing the product A*x on the device. Must not be null.
 * @param[in]  Pfp         Progress callback (may be null). A non-zero return stops the iteration.
 * @param      m           Host array holding the initial solution on entry and the result on exit.
 * @param[in]  B           Host array of the right-hand side.
 * @param[in]  n_size      Length of m and B.
 * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
 * @param[in]  param       Solver parameters. defparam is used if this is null.
 * @param      instance    User data forwarded to Afp and Pfp.
 * @param[in]  cub_handle  cuBLAS handle.
 * @param[in]  cus_handle  cuSPARSE handle.
 *
 * @return     LCG_ALREADY_OPTIMIZIED, LCG_CONVERGENCE, LCG_STOP, LCG_REACHED_MAX_ITERATIONS
 * or one of the LCG_INVILAD_* / LCG_INVALID_POINTER codes for bad arguments.
 */
 int lcg(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
    const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// set CG parameters
 	lcg_para para = (param != nullptr) ? (*param) : defparam;
 	// check parameters (nothing is allocated before these early returns)
 	if (n_size <= 0) return LCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return LCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return LCG_INVILAD_EPSILON;
 	if (m == nullptr) return LCG_INVALID_POINTER;
 	if (B == nullptr) return LCG_INVALID_POINTER;
 	if (cub_handle == nullptr) return LCG_INVALID_POINTER;
 	if (cus_handle == nullptr) return LCG_INVALID_POINTER;
 	// Afp is called unconditionally below, so a null callback must be rejected here
 	if (Afp == nullptr) return LCG_INVALID_POINTER;
 	// allocate device memory
 	const size_t n_bytes = n_size * sizeof(lcg_float);
 	lcg_float *d_m = nullptr, *d_B = nullptr;
 	lcg_float *gk = nullptr, *dk = nullptr, *Adk = nullptr;
 	cudaMalloc(&d_m, n_bytes);
 	cudaMalloc(&d_B, n_bytes);
 	cudaMalloc(&gk, n_bytes);
 	cudaMalloc(&dk, n_bytes);
 	cudaMalloc(&Adk, n_bytes);
 	// Copy the initial solution and the right-hand side to the device
 	cudaMemcpy(d_m, m, n_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, n_bytes, cudaMemcpyHostToDevice);
 	// Dense vector descriptors handed to the Afp callback
 	cusparseDnVecDescr_t dvec_m, dvec_dk, dvec_Adk;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_Adk, n_size, Adk, CUDA_R_64F);
 	lcg_float none = -1.0;
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Adk, n_size, nz_size);
 	// g0 = Ax - B
 	cudaMemcpy(gk, Adk, n_bytes, cudaMemcpyDeviceToDevice); // g0 = A*x
 	cublasDaxpy_v2(cub_handle, n_size, &none, d_B, 1, gk, 1); // g0 -= B
 	cudaMemset(dk, 0, n_bytes); // d0 = 0
 	cublasDaxpy_v2(cub_handle, n_size, &none, gk, 1, dk, 1); // d0 = -g0
 	lcg_float gk_mod;
 	cublasDdot_v2(cub_handle, n_size, gk, 1, gk, 1, &gk_mod); // gk_mod = gk^T * gk
 	// Reference value of the relative residual, never smaller than one
 	lcg_float g0_mod = gk_mod;
 	if (g0_mod < 1.0) g0_mod = 1.0;
 	int ret, t = 0;
 	// Test the starting point with the criterion selected by abs_diff, the same one the
 	// iteration uses, so that the two criteria are never mixed.
 	lcg_float residual = para.abs_diff ? sqrt(gk_mod)/n_size : gk_mod/g0_mod;
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 	}
 	else
 	{
 		lcg_float dTAd, ak, betak, gk1_mod;
 		while (1)
 		{
 			if (para.abs_diff) residual = sqrt(gk_mod)/n_size;
 			else residual = gk_mod/g0_mod;
 			if (Pfp != nullptr && Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = LCG_STOP;
 				break;
 			}
 			if (residual <= para.epsilon)
 			{
 				ret = LCG_CONVERGENCE;
 				break;
 			}
 			// max_iterations == 0 means no iteration limit
 			if (para.max_iterations > 0 && t+1 > para.max_iterations)
 			{
 				ret = LCG_REACHED_MAX_ITERATIONS;
 				break;
 			}
 			t++;
 			Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Adk, n_size, nz_size);
 			cublasDdot_v2(cub_handle, n_size, dk, 1, Adk, 1, &dTAd); // dTAd = dk^T * Adk
 			ak = gk_mod/dTAd;
 			cublasDaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1); // m += ak*dk
 			cublasDaxpy_v2(cub_handle, n_size, &ak, Adk, 1, gk, 1); // gk += ak*Adk
 			cublasDdot_v2(cub_handle, n_size, gk, 1, gk, 1, &gk1_mod); // gk1_mod = gk^T * gk
 			betak = gk1_mod/gk_mod;
 			gk_mod = gk1_mod;
 			cublasDscal_v2(cub_handle, n_size, &betak, dk, 1); // dk *= betak
 			cublasDaxpy_v2(cub_handle, n_size, &none, gk, 1, dk, 1); // dk -= gk
 		}
 	}
 	// Copy the solution back to the host, then release the device resources
 	cudaMemcpy(m, d_m, n_bytes, cudaMemcpyDeviceToHost);
 	cusparseDestroyDnVec(dvec_m);
 	cusparseDestroyDnVec(dvec_dk);
 	cusparseDestroyDnVec(dvec_Adk);
 	cudaFree(d_m);
 	cudaFree(d_B);
 	cudaFree(dk);
 	cudaFree(gk);
 	cudaFree(Adk);
 	return ret;
 }
 /**
 * @brief      Conjugate gradient squared (CGS) solver of A*m = B that iterates on the GPU.
 *
 * @note       m and B are host arrays: they are copied to device buffers on entry and the
 * solution is copied back into m on exit. The progress callback receives the device copy of m.
 * The double-precision cuBLAS routines and CUDA_R_64F descriptors are used, so lcg_float is
 * presumably double - TODO confirm.
 * NOTE(review): return codes of the CUDA/cuBLAS/cuSPARSE calls are not checked.
 *
 * @param[in]  Afp         Callback computing the product A*x on the device. Must not be null.
 * @param[in]  Pfp         Progress callback (may be null). A non-zero return stops the iteration.
 * @param      m           Host array holding the initial solution on entry and the result on exit.
 * @param[in]  B           Host array of the right-hand side.
 * @param[in]  n_size      Length of m and B.
 * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
 * @param[in]  param       Solver parameters. defparam is used if this is null.
 * @param      instance    User data forwarded to Afp and Pfp.
 * @param[in]  cub_handle  cuBLAS handle.
 * @param[in]  cus_handle  cuSPARSE handle.
 *
 * @return     LCG_ALREADY_OPTIMIZIED, LCG_CONVERGENCE, LCG_STOP, LCG_REACHED_MAX_ITERATIONS
 * or one of the LCG_INVILAD_* / LCG_INVALID_POINTER codes for bad arguments.
 */
 int lcgs(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
    const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// set CG parameters
 	lcg_para para = (param != nullptr) ? (*param) : defparam;
 	// check parameters (nothing is allocated before these early returns)
 	if (n_size <= 0) return LCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return LCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return LCG_INVILAD_EPSILON;
 	if (m == nullptr) return LCG_INVALID_POINTER;
 	if (B == nullptr) return LCG_INVALID_POINTER;
 	if (cub_handle == nullptr) return LCG_INVALID_POINTER;
 	if (cus_handle == nullptr) return LCG_INVALID_POINTER;
 	// Afp is called unconditionally below, so a null callback must be rejected here
 	if (Afp == nullptr) return LCG_INVALID_POINTER;
 	// allocate device memory
 	const size_t n_bytes = n_size * sizeof(lcg_float);
 	lcg_float *d_m = nullptr, *d_B = nullptr;
 	lcg_float *rk = nullptr, *r0T = nullptr, *pk = nullptr, *qpk = nullptr;
 	lcg_float *Ax = nullptr, *uk = nullptr,   *qk = nullptr, *wk = nullptr;
 	cudaMalloc(&d_m, n_bytes);
 	cudaMalloc(&d_B, n_bytes);
 	cudaMalloc(&rk, n_bytes);
 	cudaMalloc(&r0T, n_bytes);
 	cudaMalloc(&pk, n_bytes);
 	cudaMalloc(&qpk, n_bytes);
 	cudaMalloc(&Ax, n_bytes);
 	cudaMalloc(&uk, n_bytes);
 	cudaMalloc(&qk, n_bytes);
 	cudaMalloc(&wk, n_bytes);
 	// Copy the initial solution and the right-hand side to the device
 	cudaMemcpy(d_m, m, n_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, n_bytes, cudaMemcpyHostToDevice);
 	// Dense vector descriptors handed to the Afp callback
 	cusparseDnVecDescr_t dvec_m, dvec_wk, dvec_pk, dvec_Ax;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_wk, n_size, wk, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_pk, n_size, pk, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_R_64F);
 	lcg_float one = 1.0;
 	lcg_float none = -1.0;
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size);
 	// r0 = B - Ax
 	cudaMemcpy(rk, d_B, n_bytes, cudaMemcpyDeviceToDevice); // r0 = B
 	cublasDaxpy_v2(cub_handle, n_size, &none, Ax, 1, rk, 1); // r0 -= Ax
 	// p0 = u0 = r0T = r0
 	cudaMemcpy(pk, rk, n_bytes, cudaMemcpyDeviceToDevice);
 	cudaMemcpy(uk, rk, n_bytes, cudaMemcpyDeviceToDevice);
 	cudaMemcpy(r0T, rk, n_bytes, cudaMemcpyDeviceToDevice);
 	lcg_float rkr0T;
 	cublasDdot_v2(cub_handle, n_size, rk, 1, r0T, 1, &rkr0T); // rkr0T = rk^T * r0T
 	lcg_float rk_mod;
 	cublasDdot_v2(cub_handle, n_size, rk, 1, rk, 1, &rk_mod); // rk_mod = rk^T * rk
 	// Reference value of the relative residual, never smaller than one
 	lcg_float r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	int ret, t = 0;
 	// Test the starting point with the criterion selected by abs_diff, the same one the
 	// iteration uses, so that the two criteria are never mixed.
 	lcg_float residual = para.abs_diff ? sqrt(rk_mod)/n_size : rk_mod/r0_mod;
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 	}
 	else
 	{
 		lcg_float ak, nak, rkr0T1, AprT, betak;
 		while (1)
 		{
 			if (para.abs_diff) residual = sqrt(rk_mod)/n_size;
 			else residual = rk_mod/r0_mod;
 			if (Pfp != nullptr && Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = LCG_STOP;
 				break;
 			}
 			if (residual <= para.epsilon)
 			{
 				ret = LCG_CONVERGENCE;
 				break;
 			}
 			// max_iterations == 0 means no iteration limit
 			if (para.max_iterations > 0 && t+1 > para.max_iterations)
 			{
 				ret = LCG_REACHED_MAX_ITERATIONS;
 				break;
 			}
 			t++;
 			Afp(instance, cub_handle, cus_handle, dvec_pk, dvec_Ax, n_size, nz_size); // Ax = A*pk
 			AprT = 0.0;
 			cublasDdot_v2(cub_handle, n_size, r0T, 1, Ax, 1, &AprT); // AprT = r0T^T * A*pk
 			ak = rkr0T/AprT;
 			nak = -1.0*ak;
 			cudaMemcpy(qk, uk, n_bytes, cudaMemcpyDeviceToDevice);
 			cudaMemcpy(wk, uk, n_bytes, cudaMemcpyDeviceToDevice);
 			cublasDaxpy_v2(cub_handle, n_size, &nak, Ax, 1, qk, 1); // qk = uk - ak*A*pk
 			cublasDaxpy_v2(cub_handle, n_size, &one, qk, 1, wk, 1); // wk = uk + qk
 			Afp(instance, cub_handle, cus_handle, dvec_wk, dvec_Ax, n_size, nz_size); // Ax = A*wk
 			cublasDaxpy_v2(cub_handle, n_size, &ak, wk, 1, d_m, 1); // m += ak*wk
 			cublasDaxpy_v2(cub_handle, n_size, &nak, Ax, 1, rk, 1); // rk -= ak*A*wk
 			cublasDdot_v2(cub_handle, n_size, rk, 1, rk, 1, &rk_mod); // rk_mod = rk^T * rk
 			cublasDdot_v2(cub_handle, n_size, rk, 1, r0T, 1, &rkr0T1);
 			betak = rkr0T1/rkr0T;
 			rkr0T = rkr0T1;
 			cudaMemcpy(uk, rk, n_bytes, cudaMemcpyDeviceToDevice);
 			cublasDaxpy_v2(cub_handle, n_size, &betak, qk, 1, uk, 1); // uk = rk + betak*qk
 			cudaMemcpy(qpk, qk, n_bytes, cudaMemcpyDeviceToDevice);
 			cublasDaxpy_v2(cub_handle, n_size, &betak, pk, 1, qpk, 1); // qpk = qk + betak*pk
 			cudaMemcpy(pk, uk, n_bytes, cudaMemcpyDeviceToDevice);
 			cublasDaxpy_v2(cub_handle, n_size, &betak, qpk, 1, pk, 1); // pk = uk + betak*qpk
 		}
 	}
 	// Copy the solution back to the host, then release the device resources
 	cudaMemcpy(m, d_m, n_bytes, cudaMemcpyDeviceToHost);
 	cusparseDestroyDnVec(dvec_m);
 	cusparseDestroyDnVec(dvec_wk);
 	cusparseDestroyDnVec(dvec_pk);
 	cusparseDestroyDnVec(dvec_Ax);
 	cudaFree(d_m);
 	cudaFree(d_B);
 	cudaFree(rk);
 	cudaFree(r0T);
 	cudaFree(pk);
 	cudaFree(qpk);
 	cudaFree(Ax);
 	cudaFree(uk);
 	cudaFree(qk);
 	cudaFree(wk);
 	return ret;
 }
 /**
 * @brief      Preconditioned conjugate gradient solver of A*m = B that iterates on the GPU.
 *
 * @note       m and B are host arrays: they are copied to device buffers on entry and the
 * solution is copied back into m on exit. The progress callback receives the device copy of m.
 * The double-precision cuBLAS routines and CUDA_R_64F descriptors are used, so lcg_float is
 * presumably double - TODO confirm.
 * NOTE(review): return codes of the CUDA/cuBLAS/cuSPARSE calls are not checked.
 *
 * @param[in]  Afp         Callback computing the product A*x on the device. Must not be null.
 * @param[in]  Mfp         Callback applying the preconditioner to the residual on the device. Must not be null.
 * @param[in]  Pfp         Progress callback (may be null). A non-zero return stops the iteration.
 * @param      m           Host array holding the initial solution on entry and the result on exit.
 * @param[in]  B           Host array of the right-hand side.
 * @param[in]  n_size      Length of m and B.
 * @param[in]  nz_size     Forwarded unchanged to Afp, Mfp and Pfp.
 * @param[in]  param       Solver parameters. defparam is used if this is null.
 * @param      instance    User data forwarded to Afp, Mfp and Pfp.
 * @param[in]  cub_handle  cuBLAS handle.
 * @param[in]  cus_handle  cuSPARSE handle.
 *
 * @return     LCG_ALREADY_OPTIMIZIED, LCG_CONVERGENCE, LCG_STOP, LCG_REACHED_MAX_ITERATIONS
 * or one of the LCG_INVILAD_* / LCG_INVALID_POINTER codes for bad arguments.
 */
 int lpcg(lcg_axfunc_cuda_ptr Afp, lcg_axfunc_cuda_ptr Mfp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
 	const int n_size, const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// set CG parameters
 	lcg_para para = (param != nullptr) ? (*param) : defparam;
 	// check parameters (nothing is allocated before these early returns)
 	if (n_size <= 0) return LCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return LCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return LCG_INVILAD_EPSILON;
 	if (m == nullptr) return LCG_INVALID_POINTER;
 	if (B == nullptr) return LCG_INVALID_POINTER;
 	if (cub_handle == nullptr) return LCG_INVALID_POINTER;
 	if (cus_handle == nullptr) return LCG_INVALID_POINTER;
 	// Both callbacks are called unconditionally below, so null callbacks must be rejected here
 	if (Afp == nullptr) return LCG_INVALID_POINTER;
 	if (Mfp == nullptr) return LCG_INVALID_POINTER;
 	// allocate device memory
 	const size_t n_bytes = n_size * sizeof(lcg_float);
 	lcg_float *d_m = nullptr, *d_B = nullptr;
 	lcg_float *rk = nullptr, *zk = nullptr, *dk = nullptr, *Adk = nullptr;
 	cudaMalloc(&d_m, n_bytes);
 	cudaMalloc(&d_B, n_bytes);
 	cudaMalloc(&rk, n_bytes);
 	cudaMalloc(&zk, n_bytes);
 	cudaMalloc(&dk, n_bytes);
 	cudaMalloc(&Adk, n_bytes);
 	// Copy the initial solution and the right-hand side to the device
 	cudaMemcpy(d_m, m, n_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, n_bytes, cudaMemcpyHostToDevice);
 	// Dense vector descriptors handed to the Afp and Mfp callbacks
 	cusparseDnVecDescr_t dvec_m, dvec_rk, dvec_zk, dvec_dk, dvec_Adk;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_rk, n_size, rk, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_zk, n_size, zk, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_Adk, n_size, Adk, CUDA_R_64F);
 	lcg_float one = 1.0;
 	lcg_float none = -1.0;
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Adk, n_size, nz_size);
 	// r0 = B - Ax
 	cudaMemcpy(rk, d_B, n_bytes, cudaMemcpyDeviceToDevice); // r0 = B
 	cublasDaxpy_v2(cub_handle, n_size, &none, Adk, 1, rk, 1); // r0 -= Ax
 	// z0 is produced by the preconditioner callback from r0
 	Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_zk, n_size, nz_size);
 	// d0 = z0
 	cudaMemcpy(dk, zk, n_bytes, cudaMemcpyDeviceToDevice);
 	lcg_float rk_mod;
 	cublasDdot_v2(cub_handle, n_size, rk, 1, rk, 1, &rk_mod); // rk_mod = rk^T * rk
 	// Reference value of the relative residual, never smaller than one
 	lcg_float r0_mod = rk_mod;
 	if (r0_mod < 1.0) r0_mod = 1.0;
 	lcg_float zTr;
 	cublasDdot_v2(cub_handle, n_size, zk, 1, rk, 1, &zTr); // zTr = zk^T * rk
 	int ret, t = 0;
 	// Test the starting point with the criterion selected by abs_diff, the same one the
 	// iteration uses, so that the two criteria are never mixed.
 	lcg_float residual = para.abs_diff ? sqrt(rk_mod)/n_size : rk_mod/r0_mod;
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 	}
 	else
 	{
 		lcg_float dTAd, ak, nak, betak, zTr1;
 		while (1)
 		{
 			if (para.abs_diff) residual = sqrt(rk_mod)/n_size;
 			else residual = rk_mod/r0_mod;
 			if (Pfp != nullptr && Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = LCG_STOP;
 				break;
 			}
 			if (residual <= para.epsilon)
 			{
 				ret = LCG_CONVERGENCE;
 				break;
 			}
 			// max_iterations == 0 means no iteration limit
 			if (para.max_iterations > 0 && t+1 > para.max_iterations)
 			{
 				ret = LCG_REACHED_MAX_ITERATIONS;
 				break;
 			}
 			t++;
 			Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Adk, n_size, nz_size);
 			cublasDdot_v2(cub_handle, n_size, dk, 1, Adk, 1, &dTAd); // dTAd = dk^T * Adk
 			ak = zTr/dTAd;
 			nak = -1.0*ak;
 			cublasDaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1); // m += ak*dk
 			cublasDaxpy_v2(cub_handle, n_size, &nak, Adk, 1, rk, 1); // rk -= ak*Adk
 			Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_zk, n_size, nz_size);
 			cublasDdot_v2(cub_handle, n_size, rk, 1, rk, 1, &rk_mod); // rk_mod = rk^T * rk
 			cublasDdot_v2(cub_handle, n_size, zk, 1, rk, 1, &zTr1);
 			betak = zTr1/zTr;
 			zTr = zTr1;
 			cublasDscal_v2(cub_handle, n_size, &betak, dk, 1); // dk *= betak
 			cublasDaxpy_v2(cub_handle, n_size, &one, zk, 1, dk, 1); // dk += zk
 		}
 	}
 	// Copy the solution back to the host, then release the device resources
 	cudaMemcpy(m, d_m, n_bytes, cudaMemcpyDeviceToHost);
 	cusparseDestroyDnVec(dvec_m);
 	cusparseDestroyDnVec(dvec_rk);
 	cusparseDestroyDnVec(dvec_zk);
 	cusparseDestroyDnVec(dvec_dk);
 	cusparseDestroyDnVec(dvec_Adk);
 	cudaFree(d_m);
 	cudaFree(d_B);
 	cudaFree(rk);
 	cudaFree(zk);
 	cudaFree(dk);
 	cudaFree(Adk);
 	return ret;
 }
 /**
 * @brief      Projected gradient solver of A*m = B with box constraints low <= m <= hig,
 * iterating on the GPU. The step length is updated each iteration as (sk^T*sk)/(sk^T*yk),
 * where sk and yk are the changes of the solution and of the gradient.
 *
 * @note       m and B are host arrays: they are copied to device buffers on entry and the
 * solution is copied back into m on exit. The progress callback receives the device copy of m.
 * NOTE(review): low and hig are handed to lcg_set2box_cuda() together with device buffers,
 * so they presumably have to be device pointers - confirm against lcg_set2box_cuda().
 * The double-precision cuBLAS routines and CUDA_R_64F descriptors are used, so lcg_float is
 * presumably double - TODO confirm.
 * NOTE(review): return codes of the CUDA/cuBLAS/cuSPARSE calls are not checked.
 *
 * @param[in]  Afp         Callback computing the product A*x on the device. Must not be null.
 * @param[in]  Pfp         Progress callback (may be null). A non-zero return stops the iteration.
 * @param      m           Host array holding the initial solution on entry and the result on exit.
 * @param[in]  B           Host array of the right-hand side.
 * @param[in]  low         Lower bounds of the solution.
 * @param[in]  hig         Upper bounds of the solution.
 * @param[in]  n_size      Length of m and B.
 * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
 * @param[in]  param       Solver parameters. defparam is used if this is null. param->step is the initial step length.
 * @param      instance    User data forwarded to Afp and Pfp.
 * @param[in]  cub_handle  cuBLAS handle.
 * @param[in]  cus_handle  cuSPARSE handle.
 *
 * @return     LCG_ALREADY_OPTIMIZIED, LCG_CONVERGENCE, LCG_STOP, LCG_REACHED_MAX_ITERATIONS
 * or one of the LCG_INVILAD_* / LCG_INVALID_* codes for bad arguments.
 */
 int lpg(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
 	const lcg_float* low, const lcg_float* hig, const int n_size, const int nz_size, const lcg_para* param, 
 	void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// set CG parameters
 	lcg_para para = (param != nullptr) ? (*param) : defparam;
 	// check parameters (nothing is allocated before these early returns)
 	if (n_size <= 0) return LCG_INVILAD_VARIABLE_SIZE;
 	if (para.max_iterations < 0) return LCG_INVILAD_MAX_ITERATIONS;
 	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return LCG_INVILAD_EPSILON;
 	if (para.step <= 0.0) return LCG_INVALID_LAMBDA;
 	if (m == nullptr) return LCG_INVALID_POINTER;
 	if (B == nullptr) return LCG_INVALID_POINTER;
 	if (low == nullptr) return LCG_INVALID_POINTER;
 	if (hig == nullptr) return LCG_INVALID_POINTER;
 	if (cub_handle == nullptr) return LCG_INVALID_POINTER;
 	if (cus_handle == nullptr) return LCG_INVALID_POINTER;
 	// Afp is called unconditionally below, so a null callback must be rejected here
 	if (Afp == nullptr) return LCG_INVALID_POINTER;
 	// allocate device memory
 	const size_t n_bytes = n_size * sizeof(lcg_float);
 	lcg_float *d_m = nullptr, *d_B = nullptr;
 	lcg_float *gk = nullptr, *Adk = nullptr;
 	lcg_float *m_new = nullptr, *gk_new = nullptr;
 	lcg_float *sk = nullptr, *yk = nullptr;
 	cudaMalloc(&d_m, n_bytes);
 	cudaMalloc(&d_B, n_bytes);
 	cudaMalloc(&gk, n_bytes);
 	cudaMalloc(&Adk, n_bytes);
 	cudaMalloc(&m_new, n_bytes);
 	cudaMalloc(&gk_new, n_bytes);
 	cudaMalloc(&sk, n_bytes);
 	cudaMalloc(&yk, n_bytes);
 	// Copy the initial solution and the right-hand side to the device
 	cudaMemcpy(d_m, m, n_bytes, cudaMemcpyHostToDevice);
 	cudaMemcpy(d_B, B, n_bytes, cudaMemcpyHostToDevice);
 	// Dense vector descriptors handed to the Afp callback
 	cusparseDnVecDescr_t dvec_m, dvec_mnew, dvec_Adk;
 	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_mnew, n_size, m_new, CUDA_R_64F);
 	cusparseCreateDnVec(&dvec_Adk, n_size, Adk, CUDA_R_64F);
 	lcg_float none = -1.0;
 	lcg_float nalpha_k, alpha_k = para.step;
 	// Project the starting point into the box. This must act on the device copy d_m:
 	// the host array m has already been uploaded and is not read again by the iteration.
 	lcg_set2box_cuda(low, hig, d_m, n_size);
 	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Adk, n_size, nz_size);
 	// g0 = Ax - B
 	cudaMemcpy(gk, Adk, n_bytes, cudaMemcpyDeviceToDevice); // g0 = A*x
 	cublasDaxpy_v2(cub_handle, n_size, &none, d_B, 1, gk, 1); // g0 -= B
 	lcg_float gk_mod;
 	cublasDdot_v2(cub_handle, n_size, gk, 1, gk, 1, &gk_mod); // gk_mod = gk^T * gk
 	// Reference value of the relative residual, never smaller than one
 	lcg_float g0_mod = gk_mod;
 	if (g0_mod < 1.0) g0_mod = 1.0;
 	int ret, t = 0;
 	// Test the starting point with the criterion selected by abs_diff, the same one the
 	// iteration uses, so that the two criteria are never mixed.
 	lcg_float residual = para.abs_diff ? sqrt(gk_mod)/n_size : gk_mod/g0_mod;
 	if (residual <= para.epsilon)
 	{
 		ret = LCG_ALREADY_OPTIMIZIED;
 		if (Pfp != nullptr)
 		{
 			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
 		}
 	}
 	else
 	{
 		lcg_float sk_mod, syk_mod;
 		while (1)
 		{
 			if (para.abs_diff) residual = sqrt(gk_mod)/n_size;
 			else residual = gk_mod/g0_mod;
 			if (Pfp != nullptr && Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
 			{
 				ret = LCG_STOP;
 				break;
 			}
 			if (residual <= para.epsilon)
 			{
 				ret = LCG_CONVERGENCE;
 				break;
 			}
 			// max_iterations == 0 means no iteration limit
 			if (para.max_iterations > 0 && t+1 > para.max_iterations)
 			{
 				ret = LCG_REACHED_MAX_ITERATIONS;
 				break;
 			}
 			t++;
 			nalpha_k = -1.0*alpha_k;
 			// m_new = m - alpha_k*gk, projected into the box
 			cudaMemcpy(m_new, d_m, n_bytes, cudaMemcpyDeviceToDevice);
 			cublasDaxpy_v2(cub_handle, n_size, &nalpha_k, gk, 1, m_new, 1);
 			lcg_set2box_cuda(low, hig, m_new, n_size);
 			Afp(instance, cub_handle, cus_handle, dvec_mnew, dvec_Adk, n_size, nz_size);
 			// gk_new = A*m_new - B (B has to be subtracted from gk_new, not from the old gk)
 			cudaMemcpy(gk_new, Adk, n_bytes, cudaMemcpyDeviceToDevice);
 			cublasDaxpy_v2(cub_handle, n_size, &none, d_B, 1, gk_new, 1);
 			// sk = m_new - m
 			cudaMemcpy(sk, m_new, n_bytes, cudaMemcpyDeviceToDevice);
 			cublasDaxpy_v2(cub_handle, n_size, &none, d_m, 1, sk, 1);
 			// yk = gk_new - gk (the old gk has to be subtracted from yk, not from sk)
 			cudaMemcpy(yk, gk_new, n_bytes, cudaMemcpyDeviceToDevice);
 			cublasDaxpy_v2(cub_handle, n_size, &none, gk, 1, yk, 1);
 			cublasDdot_v2(cub_handle, n_size, sk, 1, sk, 1, &sk_mod); // sk_mod = sk^T * sk
 			cublasDdot_v2(cub_handle, n_size, sk, 1, yk, 1, &syk_mod); // syk_mod = sk^T * yk
 			alpha_k = sk_mod/syk_mod;
 			// Accept the new iterate and its gradient
 			cudaMemcpy(d_m, m_new, n_bytes, cudaMemcpyDeviceToDevice);
 			cudaMemcpy(gk, gk_new, n_bytes, cudaMemcpyDeviceToDevice);
 			// Update the function-scope gk_mod that feeds the residual at the top of the loop;
 			// it must not be redeclared here or the residual would never change.
 			cublasDdot_v2(cub_handle, n_size, gk, 1, gk, 1, &gk_mod); // gk_mod = gk^T * gk
 		}
 	}
 	// Copy the solution back to the host, then release the device resources
 	cudaMemcpy(m, d_m, n_bytes, cudaMemcpyDeviceToHost);
 	cusparseDestroyDnVec(dvec_m);
 	cusparseDestroyDnVec(dvec_mnew);
 	cusparseDestroyDnVec(dvec_Adk);
 	cudaFree(d_m);
 	cudaFree(d_B);
 	cudaFree(gk);
 	cudaFree(gk_new);
 	cudaFree(m_new);
 	cudaFree(sk);
 	cudaFree(yk);
 	cudaFree(Adk);
 	return ret;
 }
--- a/src/lib/lcg_cuda.h
+++ b/src/lib/lcg_cuda.h
@@ -0,0 +1,135 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _LCG_CUDA_H
 #define _LCG_CUDA_H
 #include "util.h"
 #include "algebra_cuda.h"
 #ifdef LibLCG_CUDA
 #include <cublas_v2.h>
 #include <cusparse_v2.h>
 /**
 * @brief  Callback interface for calculating the product of an N*N matrix 'A' multiplied 
 * by a column vector 'x'. Note that both A and x are hosted on the GPU device.
 * 
 * @param  instance    The user data sent for the lcg_solver_cuda() functions by the client.
 * @param  cub_handle  Handle of the cublas object.
 * @param  cus_handle  Handle of the cusparse object.
 * @param  x           Multiplier of the Ax product, passed as a cusparse dense vector descriptor.
 * @param  prod_Ax     Product of A multiplied by x, passed as a cusparse dense vector descriptor.
 * @param  n_size      Size of x and column/row numbers of A.
 * @param  nz_size     Presumably the number of non-zero elements of the sparse matrix A -- TODO confirm.
 */
 typedef void (*lcg_axfunc_cuda_ptr)(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size);
 /**
 * @brief     Callback interface for monitoring the progress and terminating the iteration 
 * if necessary. Note that m is hosted on the GPU device.
 * 
 * @param    instance    The user data sent for the lcg_solver() functions by the client.
 * @param    m           The current solutions.
 * @param    converge    The current value evaluating the iteration progress.
 * @param    param       The parameter set used by the running solver.
 * @param    n_size      The size of the variables
 * @param    nz_size     Presumably the number of non-zero elements of the sparse matrix -- TODO confirm.
 * @param    k           The iteration count.
 * 
 * @retval   int         Zero to continue the optimization process. Returning a
 *                       non-zero value will terminate the optimization process.
 */
 typedef int (*lcg_progress_cuda_ptr)(void* instance, const lcg_float* m, const lcg_float converge, 
 	const lcg_para* param, const int n_size, const int nz_size, const int k);
 /**
 * @brief      A combined conjugate gradient solver function. Note that both m and B are hosted on the GPU device.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param      B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  nz_size     Size of the non-zero element of a cusparse object.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      cub_handle  Handle of the cublas object.
 * @param      cus_handle  Handle of the cusparse object.
 * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_CG.
 *
 * @return     Status of the function.
 */
 int lcg_solver_cuda(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
    const int n_size, const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, 
    cusparseHandle_t cus_handle, lcg_solver_enum solver_id = LCG_CG);
 /**
 * @brief      A combined preconditioned conjugate gradient solver function. Note that both m and B are hosted on the GPU device.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Mfp         Callback function for calculating the product of 'Mx' for preconditioning.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param      B           Objective vector of the linear system.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  nz_size     Size of the non-zero element of a cusparse object.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      cub_handle  Handle of the cublas object.
 * @param      cus_handle  Handle of the cusparse object.
 * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_PCG.
 *
 * @return     Status of the function.
 */
 int lcg_solver_preconditioned_cuda(lcg_axfunc_cuda_ptr Afp, lcg_axfunc_cuda_ptr Mfp, lcg_progress_cuda_ptr Pfp, 
    lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, const lcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id = LCG_PCG);
 /**
 * @brief      A combined conjugate gradient solver function with box (lower/upper bound) constraints.
 * Note that both m and B are hosted on the GPU device.
 *
 * NOTE(review): a solver body visible in lcg_cuda.cu that uses 'low' and 'hig' copies its result
 * into m with cudaMemcpyDeviceToHost and frees internal device copies of m and B, which suggests
 * host pointers may be expected by that method -- confirm against the implementation.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param      B           Objective vector of the linear system.
 * @param      low         Lower bound of the acceptable solution.
 * @param      hig         Higher bound of the acceptable solution.
 * @param[in]  n_size      Size of the solution vector and objective vector.
 * @param[in]  nz_size     Size of the non-zero element of a cusparse object.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      cub_handle  Handle of the cublas object.
 * @param      cus_handle  Handle of the cusparse object.
 * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_PG.
 *
 * @return     Status of the function.
 */
 int lcg_solver_constrained_cuda(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
    const lcg_float* low, const lcg_float* hig, const int n_size, const int nz_size, const lcg_para* param, void* instance, 
    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id = LCG_PG);
 #endif // LibLCG_CUDA
 #endif // _LCG_CUDA_H
--- a/src/lib/lcg_eigen.cpp
+++ b/src/lib/lcg_eigen.cpp
--- a/src/lib/lcg_eigen.h
+++ b/src/lib/lcg_eigen.h
@@ -0,0 +1,110 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _LCG_EIGEN_H
 #define _LCG_EIGEN_H
 #include "util.h"
 #include "algebra_eigen.h"
 /**
 * @brief  Callback interface for calculating the product of an N*N matrix 'A' multiplied 
 * by a column vector 'x'.
 * 
 * @param  instance    The user data sent for the lcg_solver() functions by the client.
 * @param  x           Multiplier of the Ax product.
 * @param  prod_Ax     Product of A multiplied by x (output).
 */
 typedef void (*lcg_axfunc_eigen_ptr)(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Ax);
 /**
 * @brief     Callback interface for monitoring the progress and terminating the iteration 
 * if necessary.
 * 
 * @param    instance    The user data sent for the lcg_solver() functions by the client.
 * @param    m           Pointer to the current solutions.
 * @param    converge    The current value evaluating the iteration progress.
 * @param    param       The parameter set used by the running solver.
 * @param    k           The iteration count.
 * 
 * @retval   int         Zero to continue the optimization process. Returning a
 *                       non-zero value will terminate the optimization process.
 */
 typedef int (*lcg_progress_eigen_ptr)(void* instance, const Eigen::VectorXd *m, const lcg_float converge, 
 	const lcg_para *param, const int k);
 /**
 * @brief      A combined conjugate gradient solver function.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param      B           Objective vector of the linear system.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_CG.
 *
 * @return     Status of the function.
 */
 int lcg_solver_eigen(lcg_axfunc_eigen_ptr Afp, lcg_progress_eigen_ptr Pfp, Eigen::VectorXd &m, 
 	const Eigen::VectorXd &B, const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_CG);
 /**
 * @brief      A combined preconditioned conjugate gradient solver function.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Mfp         Callback function for calculating the product of 'M^{-1}x', in which M is the preconditioning matrix.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param      B           Objective vector of the linear system.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_PCG.
 *
 * @return     Status of the function.
 */
 int lcg_solver_preconditioned_eigen(lcg_axfunc_eigen_ptr Afp, lcg_axfunc_eigen_ptr Mfp, lcg_progress_eigen_ptr Pfp, 
 	Eigen::VectorXd &m, const Eigen::VectorXd &B, const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_PCG);
 /**
 * @brief      A combined conjugate gradient solver function with inequality constraints.
 *
 * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
 * @param[in]  Pfp         Callback function for monitoring the iteration progress.
 * @param      m           Initial solution vector.
 * @param      B           Objective vector of the linear system.
 * @param[in]  low         The lower boundary of the acceptable solution.
 * @param[in]  hig         The higher boundary of the acceptable solution.
 * @param      param       Parameter setup for the conjugate gradient methods.
 * @param      instance    The user data sent for the lcg_solver() function by the client. 
 * This variable is either 'this' for class member functions or 'NULL' for global functions.
 * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_PG.
 *
 * @return     Status of the function.
 */
 int lcg_solver_constrained_eigen(lcg_axfunc_eigen_ptr Afp, lcg_progress_eigen_ptr Pfp, Eigen::VectorXd &m, 
 	const Eigen::VectorXd &B, const Eigen::VectorXd &low, const Eigen::VectorXd &hig, 
 	const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_PG);
 #endif //_LCG_EIGEN_H
--- a/src/lib/preconditioner.cpp
+++ b/src/lib/preconditioner.cpp
@@ -0,0 +1,381 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "preconditioner.h"
 #include "cmath"
 #include "map"
 /**
 * @brief Count the COO entries that lie in the lower triangle (diagonal included),
 * i.e. the entries with row[i] >= col[i].
 *
 * @param row[in]        Row index of the input sparse matrix.
 * @param col[in]        Column index of the input sparse matrix.
 * @param nz_size[in]    Number of entries in row/col. A non-positive value yields a count of zero.
 * @param lnz_size[out]  Receives the number of entries found in the lower triangle.
 */
 void lcg_incomplete_Cholesky_half_buffsize_coo(const int *row, const int *col, int nz_size, int *lnz_size)
 {
    // The counter is kept signed on purpose: comparing an unsigned counter against a
    // negative nz_size would convert the bound to a huge value and read out of bounds.
    int count = 0;
    for (int i = 0; i < nz_size; i++)
    {
        if (row[i] >= col[i])
        {
            count++;
        }
    }

    *lnz_size = count;
    return;
 }
 /**
 * @brief Perform the incomplete Cholesky factorization of a sparse matrix stored in the COO format.
 * Only the lower triangular factor is produced.
 *
 * @note  The algorithm walks the entries in storage order and looks rows up by position, so it
 * relies on the entries being sorted by row and then by column, on rows 0..N-1 all being present,
 * and on every row owning its diagonal entry.
 *
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @param lnz_size   Capacity of the output arrays, i.e. the number of entries in the lower triangle.
 * @param IC_row     Row index of the factorized triangular sparse matrix.
 * @param IC_col     Column index of the factorized triangular sparse matrix.
 * @param IC_val     Non-zero values of the factorized triangular sparse matrix.
 */
 void lcg_incomplete_Cholesky_half_coo(const int *row, const int *col, const lcg_float *val, int N, int nz_size, 
    int lnz_size, int *IC_row, int *IC_col, lcg_float *IC_val)
 {
    // Nothing to factorize. This also protects the unconditional accesses to element 0 below.
    if (N <= 0 || nz_size <= 0 || lnz_size <= 0) return;

    const size_t n = static_cast<size_t>(N);
    const size_t nnz = static_cast<size_t>(nz_size);
    const size_t cap = static_cast<size_t>(lnz_size);
    size_t i, j, f;

    // Copy elements in the lower triangle to the output matrix. The copy never
    // writes past the capacity announced by the caller.
    size_t lnz = 0;
    for (i = 0; i < nnz && lnz < cap; i++)
    {
        if (row[i] >= col[i])
        {
            IC_row[lnz] = row[i];
            IC_col[lnz] = col[i];
            IC_val[lnz] = val[i];
            lnz++;
        }
    }
    if (lnz == 0) return;

    // We use this to store diagonal elements of the factorized lower triangular matrix
    lcg_float *diagonal = new lcg_float [n];
    // A temporary row
    lcg_float *tmp_row = new lcg_float [n];
    // Index of non-zero elements in tmp_row
    int *filled_idx = new int [n];
    // Beginning index of each row in the output matrix
    int *row_st_idx = new int [n];

    // Set initial values
    for (i = 0; i < n; i++)
    {
        diagonal[i] = 0.0;
        tmp_row[i] = 0.0;
        filled_idx[i] = -1;
        row_st_idx[i] = -1;
    }

    // Get the beginning index of each row in the matrix. The first row starts at element 0.
    j = 1;
    row_st_idx[0] = 0;
    int old_row = IC_row[0];
    for (i = 1; i < lnz && j < n; i++)
    {
        if (IC_row[i] > old_row)
        {
            row_st_idx[j] = static_cast<int>(i);
            old_row = IC_row[i];
            j++;
        }
    }

    // Calculate the first element
    IC_val[0] = sqrt(IC_val[0]);
    diagonal[0] = IC_val[0];

    // Running sum of the squared off-diagonal elements of the current row
    lcg_float dia_sum = 0.0;

    // The first one is already calculated
    for (i = 1; i < lnz; i++)
    {
        // Calculate the first column if there is one
        if (IC_col[i] == 0)
        {
            IC_val[i] = IC_val[i]/IC_val[0];
            dia_sum = dia_sum + IC_val[i]*IC_val[i];
            continue; // Case 1 break
        }

        // Calculate elements in the middle of a row
        if (IC_row[i] > IC_col[i])
        {
            // Scatter the already factorized row 'IC_col[i]' into tmp_row. An unknown row
            // start (-1) converts to a huge index and is rejected by the 'j < lnz' bound.
            f = 0;
            j = row_st_idx[IC_col[i]];
            while (j < lnz && IC_col[j] < IC_col[i])
            {
                tmp_row[IC_col[j]] = IC_val[j];
                filled_idx[f]  = IC_col[j];
                f++;
                j++;
            }

            // Subtract the inner product of the two partial rows
            j = row_st_idx[IC_row[i]];
            while (j < lnz && IC_col[j] < IC_col[i])
            {
                IC_val[i] = IC_val[i] - IC_val[j]*tmp_row[IC_col[j]];
                j++;
            }

            IC_val[i] = IC_val[i]/diagonal[IC_col[i]];
            dia_sum = dia_sum + IC_val[i]*IC_val[i];

            // Reset tmp variables
            for (j = 0; j < f; j++)
            {
                tmp_row[filled_idx[j]] = 0.0;
            }
            continue; // Case 2 break
        }

        // We have reached the diagonal position
        if (IC_row[i] == IC_col[i])
        {
            IC_val[i] = sqrt(IC_val[i] - dia_sum);
            diagonal[IC_col[i]] = IC_val[i];
            dia_sum = 0.0;
        }
    }

    delete[] diagonal;
    delete[] tmp_row;
    delete[] row_st_idx;
    delete[] filled_idx;
    return;
 }
 /**
 * @brief Perform the incomplete Cholesky factorization of a sparse matrix stored in the COO format.
 * The lower triangular factor is written to the lower part of the output and every computed value
 * is mirrored to the transposed position in the upper part when that position exists.
 *
 * @note  The algorithm walks the entries in storage order and looks rows up by position, so it
 * relies on the entries being sorted by row and then by column, on rows 0..N-1 all being present,
 * and on every row owning its diagonal entry.
 *
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements. The output arrays must hold as many entries.
 * @param IC_row     Row index of the factorized sparse matrix.
 * @param IC_col     Column index of the factorized sparse matrix.
 * @param IC_val     Non-zero values of the factorized sparse matrix.
 */
 void lcg_incomplete_Cholesky_full_coo(const int *row, const int *col, const lcg_float *val, int N, int nz_size, int *IC_row, int *IC_col, lcg_float *IC_val)
 {
    // Nothing to factorize. This also protects the unconditional accesses to element 0 below.
    if (N <= 0 || nz_size <= 0) return;

    const size_t n = static_cast<size_t>(N);
    const size_t nnz = static_cast<size_t>(nz_size);
    size_t i, j, f, l;

    // We use this to store diagonal elements of the factorized lower triangular matrix
    lcg_float *diagonal = new lcg_float [n];
    // A temporary row
    lcg_float *tmp_row = new lcg_float [n];
    // Index of non-zero elements in tmp_row
    int *filled_idx = new int [n];
    // Beginning index of each row in the input matrix
    int *row_st_idx = new int [n];

    // Set initial values
    for (i = 0; i < n; i++)
    {
        diagonal[i] = 0.0;
        tmp_row[i] = 0.0;
        filled_idx[i] = -1;
        row_st_idx[i] = -1;
    }

    // Copy elements to the output matrix
    for (i = 0; i < nnz; i++)
    {
        IC_row[i] = row[i];
        IC_col[i] = col[i];
        IC_val[i] = val[i];
    }

    // Count the elements of the lower triangular part (including the diagonal) and
    // build a map from the coordinates of the upper elements to their array index.
    size_t order, L_nz = 0;
    std::map<size_t, size_t> index_map;
    for (i = 0; i < nnz; i++)
    {
        if (row[i] >= col[i]) // Count number for the lower triangular part
        {
            L_nz++;
        }
        else // Only need to build the map for the upper triangular part
        {
            // Widen before multiplying, N*row would overflow an int for large matrices
            order = n*static_cast<size_t>(row[i]) + static_cast<size_t>(col[i]);
            index_map[order] = i;
        }
    }

    // Array index of every element in the lower triangle
    j = 0;
    size_t *low_idx = new size_t [L_nz];
    for (i = 0; i < nnz; i++)
    {
        if (row[i] >= col[i])
        {
            low_idx[j] = i;
            j++;
        }
    }

    // Get the beginning index of each row in the matrix. The first row starts at element 0.
    j = 1;
    row_st_idx[0] = 0;
    int old_row = IC_row[0];
    for (i = 1; i < nnz && j < n; i++)
    {
        if (IC_row[i] > old_row)
        {
            row_st_idx[j] = static_cast<int>(i);
            old_row = IC_row[i];
            j++;
        }
    }

    // Calculate the first element
    IC_val[0] = sqrt(IC_val[0]);
    diagonal[0] = IC_val[0];

    // Running sum of the squared off-diagonal elements of the current row
    lcg_float dia_sum = 0.0;
    // Position of the transposed element in the upper triangle, if it is stored
    std::map<size_t, size_t>::const_iterator mirror;

    // The first one is already calculated
    for (i = 1; i < L_nz; i++)
    {
        l = low_idx[i];

        // Calculate the first column if there is one
        if (IC_col[l] == 0)
        {
            IC_val[l] = IC_val[l]/IC_val[0];
            dia_sum = dia_sum + IC_val[l]*IC_val[l];

            // Set value at the upper triangle. The transposed element sits in row 0, so its key is the row index.
            // A lookup is used instead of operator[], which would insert index 0 for a
            // missing key and silently overwrite the first diagonal element.
            order = static_cast<size_t>(IC_row[l]);
            mirror = index_map.find(order);
            if (mirror != index_map.end())
            {
                IC_val[mirror->second] = IC_val[l];
            }
            continue; // Case 1 break
        }

        // Calculate elements in the middle of a row
        if (IC_row[l] > IC_col[l])
        {
            // Scatter the already factorized row 'IC_col[l]' into tmp_row. An unknown row
            // start (-1) converts to a huge index and is rejected by the 'j < nnz' bound.
            f = 0;
            j = row_st_idx[IC_col[l]];
            while (j < nnz && IC_col[j] < IC_col[l])
            {
                tmp_row[IC_col[j]] = IC_val[j];
                filled_idx[f]  = IC_col[j];
                f++;
                j++;
            }

            // Subtract the inner product of the two partial rows
            j = row_st_idx[IC_row[l]];
            while (j < nnz && IC_col[j] < IC_col[l])
            {
                IC_val[l] = IC_val[l] - IC_val[j]*tmp_row[IC_col[j]];
                j++;
            }

            IC_val[l] = IC_val[l]/diagonal[IC_col[l]];
            dia_sum = dia_sum + IC_val[l]*IC_val[l];

            // Set value at the upper triangle
            order = n*static_cast<size_t>(IC_col[l]) + static_cast<size_t>(IC_row[l]);
            mirror = index_map.find(order);
            if (mirror != index_map.end())
            {
                IC_val[mirror->second] = IC_val[l];
            }

            // Reset tmp variables
            for (j = 0; j < f; j++)
            {
                tmp_row[filled_idx[j]] = 0.0;
            }
            continue; // Case 2 break
        }

        // We have reached the diagonal position
        if (IC_row[l] == IC_col[l])
        {
            IC_val[l] = sqrt(IC_val[l] - dia_sum);
            diagonal[IC_col[l]] = IC_val[l];
            dia_sum = 0.0;
        }
    }

    delete[] diagonal;
    delete[] tmp_row;
    delete[] row_st_idx;
    delete[] filled_idx;
    delete[] low_idx;
    index_map.clear();
    return;
 }
 /**
 * @brief Solve the linear system Ux = B by backward substitution, in which U is an upper
 * triangular matrix stored in the COO format.
 *
 * @note  The entries are scanned from the last one towards the first one and the scan position
 * only moves backwards, so the entries are expected to be sorted by row. Entries below the
 * diagonal are ignored. A row without a diagonal entry leaves its solution element at zero.
 *
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param U          Non-zero values of the input sparse matrix.
 * @param B          Object array.
 * @param x          The returned solution.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 */
 void lcg_solve_upper_triangle_coo(const int *row, const int *col, const lcg_float *U, const lcg_float *B, lcg_float *x, int N, int nz_size)
 {
    for (int i = 0; i < N; i++)
    {
        x[i] = 0.0;
    }

    // Both loops count down to zero, so the counters must be signed: an unsigned counter
    // is never negative, 'j >= 0' would always hold and the index would wrap around and
    // read far out of bounds whenever a diagonal entry is missing or the input is empty.
    int iter = nz_size - 1;
    double sum;
    for (int i = N-1; i >= 0; i--)
    {
        sum = 0.0;
        for (int j = iter; j >= 0; j--)
        {
            if (row[j] == i && col[j] > i)
            {
                sum += U[j] * x[col[j]];
            }
            else if (row[j] == i && col[j] == i)
            {
                x[i] = (B[i] - sum)/U[j];
                // Continue the next row right before this diagonal. Once the first entry
                // is consumed iter becomes -1 and the remaining rows keep their zeros.
                iter = j-1;
                break;
            }
        }
    }
    return;
 }
 /**
 * @brief Solve the linear system Lx = B by forward substitution, in which L is a lower
 * triangular matrix stored in the COO format.
 *
 * @note  The entries are scanned forwards and the scan position only advances, so the
 * entries are expected to be sorted by row. Entries above the diagonal are ignored.
 *
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param L          Non-zero values of the input sparse matrix.
 * @param B          Object array.
 * @param x          The returned solution.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 */
 void lcg_solve_lower_triangle_coo(const int *row, const int *col, const lcg_float *L, const lcg_float *B, lcg_float *x, int N, int nz_size)
 {
    size_t r, k;

    // Start from a zero solution
    for (r = 0; r < N; r++)
    {
        x[r] = 0.0;
    }

    // First entry that has not been consumed by a previous row
    size_t start = 0;
    for (r = 0; r < N; r++)
    {
        double acc = 0.0;
        k = start;
        while (k < nz_size)
        {
            if (row[k] == r)
            {
                if (col[k] < r)
                {
                    // Contribution of an already solved unknown
                    acc += L[k] * x[col[k]];
                }
                else if (col[k] == r)
                {
                    // The diagonal closes the row
                    x[r] = (B[r] - acc)/L[k];
                    start = k+1;
                    break;
                }
            }
            k++;
        }
    }
    return;
 }
 /**
 * @brief Check whether a square COO matrix owns N non-zero diagonal entries.
 *
 * @note  Only stored diagonal entries are inspected; every stored entry with row == col
 * and a non-zero value is counted, and the count is compared against N.
 *
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param M          Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @return true      The number of non-zero diagonal entries equals N.
 * @return false     Otherwise.
 */
 bool lcg_full_rank_coo(const int *row, const int *col, const lcg_float *M, int N, int nz_size)
 {
    size_t diag_count = 0;
    size_t k = 0;
    while (k < nz_size)
    {
        const bool on_diagonal = (row[k] == col[k]);
        if (on_diagonal && M[k] != 0.0)
        {
            diag_count++;
        }
        k++;
    }
    return diag_count == N;
 }
--- a/src/lib/preconditioner.h
+++ b/src/lib/preconditioner.h
@@ -0,0 +1,110 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _PRECONDITIONER_H
 #define _PRECONDITIONER_H
 #include "algebra.h"
 /**
 * @brief Return the number of non-zero elements in the lower triangular part (diagonal included) of the input matrix
 * 
 * @param row[in]        Row index of the input sparse matrix.
 * @param col[in]        Column index of the input sparse matrix.
 * @param nz_size[in]    Length of the non-zero elements.
 * @param lnz_size[out]  Length of the non-zero elements in the lower triangle
 */
 void lcg_incomplete_Cholesky_half_buffsize_coo(const int *row, const int *col, int nz_size, int *lnz_size);
 /**
 * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format.
 * 
 * @note  Only the factorized lower triangular matrix is stored in the lower part of the output matrix accordingly.
 * @note  The implementation processes the entries in storage order; it appears to expect them
 *        sorted by row and then by column, with a diagonal entry in every row -- confirm with callers.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @param lnz_size   Length of the non-zero elements in the lower triangle
 * @param IC_row     Row index of the factorized triangular sparse matrix.
 * @param IC_col     Column index of the factorized triangular sparse matrix.
 * @param IC_val     Non-zero values of the factorized triangular sparse matrix.
 */
 void lcg_incomplete_Cholesky_half_coo(const int *row, const int *col, const lcg_float *val, int N, int nz_size, int lnz_size, int *IC_row, int *IC_col, lcg_float *IC_val);
 /**
 * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format.
 * 
 * @note  The factorized lower and upper triangular matrices are stored in the lower and upper triangular parts of the output matrix accordingly.
 * @note  The implementation processes the entries in storage order; it appears to expect them
 *        sorted by row and then by column, with a diagonal entry in every row -- confirm with callers.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @param IC_row     Row index of the factorized sparse matrix.
 * @param IC_col     Column index of the factorized sparse matrix.
 * @param IC_val     Non-zero values of the factorized sparse matrix.
 */
 void lcg_incomplete_Cholesky_full_coo(const int *row, const int *col, const lcg_float *val, int N, int nz_size, int *IC_row, int *IC_col, lcg_float *IC_val);
 /**
 * @brief Solve the linear system Ux = B, in which U is an upper triangular matrix.
 * 
 * @note  The implementation scans the entries from the last one backwards, one row at a time,
 *        so the entries are expected to be sorted by row. Entries below the diagonal are ignored.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param U          Non-zero values of the input sparse matrix.
 * @param B          Object array.
 * @param x          The returned solution.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 */
 void lcg_solve_upper_triangle_coo(const int *row, const int *col, const lcg_float *U, const lcg_float *B, lcg_float *x, int N, int nz_size);
 /**
 * @brief Solve the linear system Lx = B, in which L is a lower triangular matrix.
 * 
 * @note  The implementation scans the entries forwards, one row at a time, so the entries
 *        are expected to be sorted by row. Entries above the diagonal are ignored.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param L          Non-zero values of the input sparse matrix.
 * @param B          Object array.
 * @param x          The returned solution.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 */
 void lcg_solve_lower_triangle_coo(const int *row, const int *col, const lcg_float *L, const lcg_float *B, lcg_float *x, int N, int nz_size);
 /**
 * @brief Check to see if a square matrix is full ranked or not. The sparse matrix is stored in the COO format.
 * 
 * @note  The implementation only counts the stored diagonal entries with a non-zero value
 *        and compares that count against N; no other rank test is performed.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param M          Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @return true      The matrix is full ranked.
 * @return false     The matrix is not full ranked.
 */
 bool lcg_full_rank_coo(const int *row, const int *col, const lcg_float *M, int N, int nz_size);
 #endif // _PRECONDITIONER_H
--- a/src/lib/preconditioner_cuda.cu
+++ b/src/lib/preconditioner_cuda.cu
@@ -0,0 +1,421 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "preconditioner_cuda.h"
 #include "map"
 /**
 * @brief Count the non-zero entries of a COO matrix that lie in its lower triangle (diagonal included).
 * 
 * @param row[in]        Row index of the input sparse matrix.
 * @param col[in]        Column index of the input sparse matrix.
 * @param nz_size[in]    Length of the non-zero elements. A value <= 0 yields a count of 0.
 * @param lnz_size[out]  Number of entries with row >= col.
 */
 void clcg_incomplete_Cholesky_cuda_half_buffsize(const int *row, const int *col, int nz_size, int *lnz_size)
 {
    int count = 0;
    // The index is deliberately a signed int: comparing a size_t index against a
    // negative nz_size would convert the bound to a huge unsigned value and walk
    // far past the end of row/col.
    for (int i = 0; i < nz_size; i++)
    {
        if (row[i] >= col[i])
        {
            count++;
        }
    }
    *lnz_size = count;
    return;
 }
 /**
 * @brief Incomplete Cholesky factorization of a single-precision complex sparse matrix stored in the COO format.
 *        Only the lower triangle (diagonal included) is copied to the output and factorized in place.
 * 
 * @note  Products are formed with cuCmulf() without any conjugation, so the factor satisfies A ~ L*L^T (not L*L^H).
 * @note  NOTE(review): the sweep appears to rely on the entries being ordered by row and then by column, on the
 *        first stored entry being element (0,0), and on every row holding its diagonal entry -- confirm against callers.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @param lnz_size   Length of the non-zero elements in the lower triangle (size of the IC_* arrays).
 * @param IC_row     Row index of the factorized triangular sparse matrix.
 * @param IC_col     Column index of the factorized triangular sparse matrix.
 * @param IC_val     Non-zero values of the factorized triangular sparse matrix.
 */
 void clcg_incomplete_Cholesky_cuda_half(const int *row, const int *col, const cuComplex *val, int N, int nz_size, 
    int lnz_size, int *IC_row, int *IC_col, cuComplex *IC_val)
 {
    // Nothing to factorize. Leave before IC_row[0]/IC_val[0] and the N-sized scratch arrays are touched.
    if (N <= 0 || nz_size <= 0 || lnz_size <= 0) return;
    // Unsigned copies of the (now known positive) sizes, so the size_t counters below are never compared with an int.
    const size_t n = N, nnz = nz_size, lnz = lnz_size;
    // Diagonal elements of the factorized lower triangular matrix
    cuComplex *diagonal = new cuComplex [N];
    // A temporary dense row
    cuComplex *tmp_row = new cuComplex [N];
    // Column indices of the entries currently stored in tmp_row
    int *filled_idx = new int [N];
    // Offset in the IC_* arrays at which each row begins (-1 = not set)
    int *row_st_idx = new int [N];
    size_t i, j, f;
    // Set initial values
    for (i = 0; i < n; i++)
    {
        diagonal[i].x = 0.0f; diagonal[i].y = 0.0f;
        tmp_row[i].x = 0.0f; tmp_row[i].y = 0.0f;
        filled_idx[i] = -1;
        row_st_idx[i] = -1;
    }
    // Copy elements in the lower triangle to the output matrix
    j = 0;
    for (i = 0; i < nnz; i++)
    {
        if (row[i] >= col[i])
        {
            IC_row[j] = row[i];
            IC_col[j] = col[i];
            IC_val[j] = val[i];
            j++;
        }
    }
    // Get the beginning offset of each row. The first stored row starts at offset 0
    // (an array offset is wanted here, not the row number of the first entry).
    j = 1;
    row_st_idx[0] = 0;
    int old_row = IC_row[0];
    for (i = 1; i < lnz; i++)
    {
        if (IC_row[i] > old_row)
        {
            row_st_idx[j] = i;
            old_row = IC_row[i];
            j++;
        }
    }
    // Calculate the first element
    IC_val[0] = clcg_Csqrt(IC_val[0]);
    diagonal[0] = IC_val[0];
    // Running sum of the squared off-diagonal entries of the current row; consumed at the diagonal
    cuComplex dia_sum;
    dia_sum.x = 0.0f; dia_sum.y = 0.0f;
    // The first one is already calculated
    for (i = 1; i < lnz; i++)
    {
        // Case 1: entry in the first column, only a division by the first diagonal is needed
        if (IC_col[i] == 0)
        {
            IC_val[i] = cuCdivf(IC_val[i], IC_val[0]);
            dia_sum = clcg_Csum(dia_sum, cuCmulf(IC_val[i], IC_val[i]));
            continue;
        }
        // Case 2: entry strictly below the diagonal
        if (IC_row[i] > IC_col[i])
        {
            // Scatter the already factorized row IC_col[i] (columns left of its diagonal) into tmp_row.
            // The j < lnz bound keeps the scan inside the arrays when a row offset is unset (-1) or a diagonal is missing.
            f = 0;
            j = row_st_idx[IC_col[i]];
            while (j < lnz && IC_col[j] < IC_col[i])
            {
                tmp_row[IC_col[j]] = IC_val[j];
                filled_idx[f]  = IC_col[j];
                f++;
                j++;
            }
            // Subtract the products of the current row with the scattered row
            j = row_st_idx[IC_row[i]];
            while (j < lnz && IC_col[j] < IC_col[i])
            {
                IC_val[i] = clcg_Cdiff(IC_val[i], cuCmulf(IC_val[j], tmp_row[IC_col[j]]));
                j++;
            }
            IC_val[i] = cuCdivf(IC_val[i], diagonal[IC_col[i]]);
            dia_sum = clcg_Csum(dia_sum, cuCmulf(IC_val[i], IC_val[i]));
            // Reset only the slots of tmp_row that were filled above
            for (j = 0; j < f; j++)
            {
                tmp_row[filled_idx[j]].x = 0.0f; tmp_row[filled_idx[j]].y = 0.0f;
            }
            continue;
        }
        // Case 3: we have reached the diagonal position
        if (IC_row[i] == IC_col[i])
        {
            IC_val[i] = clcg_Csqrt(clcg_Cdiff(IC_val[i], dia_sum));
            diagonal[IC_col[i]] = IC_val[i];
            dia_sum.x = 0.0f; dia_sum.y = 0.0f;
        }
    }
    delete[] diagonal;
    delete[] tmp_row;
    delete[] row_st_idx;
    delete[] filled_idx;
    return;
 }
 /**
 * @brief Incomplete Cholesky factorization of a double-precision complex sparse matrix stored in the COO format.
 *        Only the lower triangle (diagonal included) is copied to the output and factorized in place.
 * 
 * @note  Products are formed with cuCmul() without any conjugation, so the factor satisfies A ~ L*L^T (not L*L^H).
 * @note  NOTE(review): the sweep appears to rely on the entries being ordered by row and then by column, on the
 *        first stored entry being element (0,0), and on every row holding its diagonal entry -- confirm against callers.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @param lnz_size   Length of the non-zero elements in the lower triangle (size of the IC_* arrays).
 * @param IC_row     Row index of the factorized triangular sparse matrix.
 * @param IC_col     Column index of the factorized triangular sparse matrix.
 * @param IC_val     Non-zero values of the factorized triangular sparse matrix.
 */
 void clcg_incomplete_Cholesky_cuda_half(const int *row, const int *col, const cuDoubleComplex *val, int N, int nz_size, 
    int lnz_size, int *IC_row, int *IC_col, cuDoubleComplex *IC_val)
 {
    // Nothing to factorize. Leave before IC_row[0]/IC_val[0] and the N-sized scratch arrays are touched.
    if (N <= 0 || nz_size <= 0 || lnz_size <= 0) return;
    // Unsigned copies of the (now known positive) sizes, so the size_t counters below are never compared with an int.
    const size_t n = N, nnz = nz_size, lnz = lnz_size;
    // Diagonal elements of the factorized lower triangular matrix
    cuDoubleComplex *diagonal = new cuDoubleComplex [N];
    // A temporary dense row
    cuDoubleComplex *tmp_row = new cuDoubleComplex [N];
    // Column indices of the entries currently stored in tmp_row
    int *filled_idx = new int [N];
    // Offset in the IC_* arrays at which each row begins (-1 = not set)
    int *row_st_idx = new int [N];
    size_t i, j, f;
    // Set initial values
    for (i = 0; i < n; i++)
    {
        diagonal[i].x = 0.0; diagonal[i].y = 0.0;
        tmp_row[i].x = 0.0; tmp_row[i].y = 0.0;
        filled_idx[i] = -1;
        row_st_idx[i] = -1;
    }
    // Copy elements in the lower triangle to the output matrix
    j = 0;
    for (i = 0; i < nnz; i++)
    {
        if (row[i] >= col[i])
        {
            IC_row[j] = row[i];
            IC_col[j] = col[i];
            IC_val[j] = val[i];
            j++;
        }
    }
    // Get the beginning offset of each row. The first stored row starts at offset 0
    // (an array offset is wanted here, not the row number of the first entry).
    j = 1;
    row_st_idx[0] = 0;
    int old_row = IC_row[0];
    for (i = 1; i < lnz; i++)
    {
        if (IC_row[i] > old_row)
        {
            row_st_idx[j] = i;
            old_row = IC_row[i];
            j++;
        }
    }
    // Calculate the first element
    IC_val[0] = clcg_Zsqrt(IC_val[0]);
    diagonal[0] = IC_val[0];
    // Running sum of the squared off-diagonal entries of the current row; consumed at the diagonal
    cuDoubleComplex dia_sum;
    dia_sum.x = 0.0; dia_sum.y = 0.0;
    // The first one is already calculated
    for (i = 1; i < lnz; i++)
    {
        // Case 1: entry in the first column, only a division by the first diagonal is needed
        if (IC_col[i] == 0)
        {
            IC_val[i] = cuCdiv(IC_val[i], IC_val[0]);
            dia_sum = clcg_Zsum(dia_sum, cuCmul(IC_val[i], IC_val[i]));
            continue;
        }
        // Case 2: entry strictly below the diagonal
        if (IC_row[i] > IC_col[i])
        {
            // Scatter the already factorized row IC_col[i] (columns left of its diagonal) into tmp_row.
            // The j < lnz bound keeps the scan inside the arrays when a row offset is unset (-1) or a diagonal is missing.
            f = 0;
            j = row_st_idx[IC_col[i]];
            while (j < lnz && IC_col[j] < IC_col[i])
            {
                tmp_row[IC_col[j]] = IC_val[j];
                filled_idx[f]  = IC_col[j];
                f++;
                j++;
            }
            // Subtract the products of the current row with the scattered row
            j = row_st_idx[IC_row[i]];
            while (j < lnz && IC_col[j] < IC_col[i])
            {
                IC_val[i] = clcg_Zdiff(IC_val[i], cuCmul(IC_val[j], tmp_row[IC_col[j]]));
                j++;
            }
            IC_val[i] = cuCdiv(IC_val[i], diagonal[IC_col[i]]);
            dia_sum = clcg_Zsum(dia_sum, cuCmul(IC_val[i], IC_val[i]));
            // Reset only the slots of tmp_row that were filled above
            for (j = 0; j < f; j++)
            {
                tmp_row[filled_idx[j]].x = 0.0; tmp_row[filled_idx[j]].y = 0.0;
            }
            continue;
        }
        // Case 3: we have reached the diagonal position
        if (IC_row[i] == IC_col[i])
        {
            IC_val[i] = clcg_Zsqrt(clcg_Zdiff(IC_val[i], dia_sum));
            diagonal[IC_col[i]] = IC_val[i];
            dia_sum.x = 0.0; dia_sum.y = 0.0;
        }
    }
    delete[] diagonal;
    delete[] tmp_row;
    delete[] row_st_idx;
    delete[] filled_idx;
    return;
 }
 /**
 * @brief Incomplete Cholesky factorization of a double-precision complex sparse matrix stored in the COO format.
 *        All entries are copied to the output; the lower triangle is factorized in place and every computed
 *        off-diagonal value is mirrored into the entry with swapped indices when such an entry is stored.
 * 
 * @note  Products are formed with cuCmul() without any conjugation, so the factor satisfies A ~ L*L^T (not L*L^H).
 * @note  NOTE(review): the sweep appears to rely on the entries being ordered by row and then by column, on the
 *        first stored entry being element (0,0), and on every row holding its diagonal entry -- confirm against callers.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @param IC_row     Row index of the factorized sparse matrix.
 * @param IC_col     Column index of the factorized sparse matrix.
 * @param IC_val     Non-zero values of the factorized sparse matrix.
 */
 void clcg_incomplete_Cholesky_cuda_full(const int *row, const int *col, const cuDoubleComplex *val, int N, int nz_size, int *IC_row, int *IC_col, cuDoubleComplex *IC_val)
 {
    // Nothing to factorize. Leave before IC_row[0]/IC_val[0] and the N-sized scratch arrays are touched.
    if (N <= 0 || nz_size <= 0) return;
    // Unsigned copies of the (now known positive) sizes, so the size_t counters below are never compared with an int.
    const size_t n = N, nnz = nz_size;
    // Diagonal elements of the factorized lower triangular matrix
    cuDoubleComplex *diagonal = new cuDoubleComplex [N];
    // A temporary dense row
    cuDoubleComplex *tmp_row = new cuDoubleComplex [N];
    // Column indices of the entries currently stored in tmp_row
    int *filled_idx = new int [N];
    // Offset in the IC_* arrays at which each row begins (-1 = not set)
    int *row_st_idx = new int [N];
    size_t i, j, f, l;
    // Set initial values
    for (i = 0; i < n; i++)
    {
        diagonal[i].x = 0.0; diagonal[i].y = 0.0;
        tmp_row[i].x = 0.0; tmp_row[i].y = 0.0;
        filled_idx[i] = -1;
        row_st_idx[i] = -1;
    }
    // Copy elements to the output matrix
    for (i = 0; i < nnz; i++)
    {
        IC_row[i] = row[i];
        IC_col[i] = col[i];
        IC_val[i] = val[i];
    }
    // Count the entries of the lower triangle (diagonal included) and build a map from the
    // coordinates of each strictly-upper entry to its position in the arrays.
    size_t order, L_nz = 0;
    std::map<size_t, size_t> index_map;
    std::map<size_t, size_t>::const_iterator mirror;
    for (i = 0; i < nnz; i++)
    {
        if (row[i] >= col[i]) // Count entries of the lower triangular part
        {
            L_nz++;
        }
        else // Only the upper triangular part needs to be mapped
        {
            // The key is computed in size_t: N*row evaluated in int overflows once N exceeds ~46k
            order = n*row[i] + col[i];
            index_map[order] = i;
        }
    }
    // Positions of the lower-triangle entries inside the full arrays
    j = 0;
    size_t *low_idx = new size_t [L_nz];
    for (i = 0; i < nnz; i++)
    {
        if (row[i] >= col[i])
        {
            low_idx[j] = i;
            j++;
        }
    }
    // Get the beginning offset of each row. The first stored row starts at offset 0
    // (an array offset is wanted here, not the row number of the first entry).
    j = 1;
    row_st_idx[0] = 0;
    int old_row = IC_row[0];
    for (i = 1; i < nnz; i++)
    {
        if (IC_row[i] > old_row)
        {
            row_st_idx[j] = i;
            old_row = IC_row[i];
            j++;
        }
    }
    // Calculate the first element
    IC_val[0] = clcg_Zsqrt(IC_val[0]);
    diagonal[0] = IC_val[0];
    // Running sum of the squared off-diagonal entries of the current row; consumed at the diagonal
    cuDoubleComplex dia_sum;
    dia_sum.x = 0.0; dia_sum.y = 0.0;
    // The first one is already calculated
    for (i = 1; i < L_nz; i++)
    {
        l = low_idx[i];
        // Case 1: entry in the first column, only a division by the first diagonal is needed
        if (IC_col[l] == 0)
        {
            IC_val[l] = cuCdiv(IC_val[l], IC_val[0]);
            dia_sum = clcg_Zsum(dia_sum, cuCmul(IC_val[l], IC_val[l]));
            // Mirror the value to entry (0, row), whose key is N*0 + row. find() is used instead of
            // operator[] so that a missing mirror entry is skipped rather than mapped to position 0,
            // which would overwrite the first diagonal value.
            order = (size_t)IC_row[l];
            mirror = index_map.find(order);
            if (mirror != index_map.end()) IC_val[mirror->second] = IC_val[l];
            continue;
        }
        // Case 2: entry strictly below the diagonal
        if (IC_row[l] > IC_col[l])
        {
            // Scatter the already factorized row IC_col[l] (columns left of its diagonal) into tmp_row.
            // The j < nnz bound keeps the scan inside the arrays when a row offset is unset (-1) or a diagonal is missing.
            f = 0;
            j = row_st_idx[IC_col[l]];
            while (j < nnz && IC_col[j] < IC_col[l])
            {
                tmp_row[IC_col[j]] = IC_val[j];
                filled_idx[f]  = IC_col[j];
                f++;
                j++;
            }
            // Subtract the products of the current row with the scattered row
            j = row_st_idx[IC_row[l]];
            while (j < nnz && IC_col[j] < IC_col[l])
            {
                IC_val[l] = clcg_Zdiff(IC_val[l], cuCmul(IC_val[j], tmp_row[IC_col[j]]));
                j++;
            }
            IC_val[l] = cuCdiv(IC_val[l], diagonal[IC_col[l]]);
            dia_sum = clcg_Zsum(dia_sum, cuCmul(IC_val[l], IC_val[l]));
            // Mirror the value to entry (col, row) when that entry is stored
            order = n*IC_col[l] + IC_row[l];
            mirror = index_map.find(order);
            if (mirror != index_map.end()) IC_val[mirror->second] = IC_val[l];
            // Reset only the slots of tmp_row that were filled above
            for (j = 0; j < f; j++)
            {
                tmp_row[filled_idx[j]].x = 0.0; tmp_row[filled_idx[j]].y = 0.0;
            }
            continue;
        }
        // Case 3: we have reached the diagonal position
        if (IC_row[l] == IC_col[l])
        {
            IC_val[l] = clcg_Zsqrt(clcg_Zdiff(IC_val[l], dia_sum));
            diagonal[IC_col[l]] = IC_val[l];
            dia_sum.x = 0.0; dia_sum.y = 0.0;
        }
    }
    delete[] diagonal;
    delete[] tmp_row;
    delete[] row_st_idx;
    delete[] filled_idx;
    delete[] low_idx;
    return;
 }
--- a/src/lib/preconditioner_cuda.h
+++ b/src/lib/preconditioner_cuda.h
@@ -0,0 +1,92 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _PRECONDITIONER_CUDA_H
 #define _PRECONDITIONER_CUDA_H
 #include "lcg_complex_cuda.h"
 #ifdef LibLCG_CUDA
 /**
 * @brief Return the number of non-zero elements in the lower triangular part of the input matrix
 * 
 * @param row[in]        Row index of the input sparse matrix.
 * @param col[in]        Column index of the input sparse matrix.
 * @param nz_size[in]    Length of the non-zero elements.
 * @param lnz_size[out]  Length of the non-zero elements in the lower triangle
 */
 void clcg_incomplete_Cholesky_cuda_half_buffsize(const int *row, const int *col, int nz_size, int *lnz_size);
 /**
 * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format.
 * 
 * @note  Only the factorized lower triangular matrix is stored in the lower part of the output matrix accordingly.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @param lnz_size   Length of the non-zero elements in the lower triangle
 * @param IC_row     Row index of the factorized triangular sparse matrix.
 * @param IC_col     Column index of the factorized triangular sparse matrix.
 * @param IC_val     Non-zero values of the factorized triangular sparse matrix.
 */
 void clcg_incomplete_Cholesky_cuda_half(const int *row, const int *col, const cuComplex *val, int N, int nz_size, int lnz_size, int *IC_row, int *IC_col, cuComplex *IC_val);
 /**
 * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format.
 * 
 * @note  Only the factorized lower triangular matrix is stored in the lower part of the output matrix accordingly.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @param lnz_size   Length of the non-zero elements in the lower triangle
 * @param IC_row     Row index of the factorized triangular sparse matrix.
 * @param IC_col     Column index of the factorized triangular sparse matrix.
 * @param IC_val     Non-zero values of the factorized triangular sparse matrix.
 */
 void clcg_incomplete_Cholesky_cuda_half(const int *row, const int *col, const cuDoubleComplex *val, int N, int nz_size, int lnz_size, int *IC_row, int *IC_col, cuDoubleComplex *IC_val);
 /**
 * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format.
 * 
 * @note  The factorized lower and upper triangular matrixes are stored in the lower and upper triangular parts of the output matrix accordingly.
 * 
 * @param row        Row index of the input sparse matrix.
 * @param col        Column index of the input sparse matrix.
 * @param val        Non-zero values of the input sparse matrix.
 * @param N          Row/Column size of the sparse matrix.
 * @param nz_size    Length of the non-zero elements.
 * @param IC_row     Row index of the factorized triangular sparse matrix.
 * @param IC_col     Column index of the factorized triangular sparse matrix.
 * @param IC_val     Non-zero values of the factorized triangular sparse matrix.
 */
 void clcg_incomplete_Cholesky_cuda_full(const int *row, const int *col, const cuDoubleComplex *val, int N, int nz_size, int *IC_row, int *IC_col, cuDoubleComplex *IC_val);
 #endif // LibLCG_CUDA
 #endif // _PRECONDITIONER_CUDA_H
--- a/src/lib/preconditioner_eigen.cpp
+++ b/src/lib/preconditioner_eigen.cpp
--- a/src/lib/preconditioner_eigen.h
+++ b/src/lib/preconditioner_eigen.h
@@ -0,0 +1,159 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _PRECONDITIONER_EIGEN_H
 #define _PRECONDITIONER_EIGEN_H
 #include "complex"
 #include "Eigen/Dense"
 #include "Eigen/SparseCore"
 /**
 * @brief     Perform the Cholesky decomposition and return the lower triangular matrix.
 * 
 * @note      This could serve as a direct solver.
 * 
 * @param A   The input matrix. Must be full rank and symmetric (aka. A = A^T)
 * @param L   The output low triangular matrix
 */
 void lcg_Cholesky(const Eigen::MatrixXd &A, Eigen::MatrixXd &L);
 /**
 * @brief      Perform the Cholesky decomposition and return the lower triangular matrix
 * 
 * @note       This could serve as a direct solver.
 *
 * @param[in]  A     The input matrix. Must be full rank and symmetric (aka. A = A^T)
 * @param      L     The output low triangular matrix
 */
 void clcg_Cholesky(const Eigen::MatrixXcd &A, Eigen::MatrixXcd &L);
 /**
 * @brief      Calculate the inverse of a lower triangular matrix (Full rank only).
 *
 * @param      L     The operating lower triangle matrix
 * @param      Linv  The inverted lower triangle matrix
 */
 void lcg_invert_lower_triangle(const Eigen::MatrixXd &L, Eigen::MatrixXd &Linv);
 /**
 * @brief      Calculate the inverse of an upper triangular matrix (Full rank only).
 *
 * @param      U     The operating upper triangle matrix
 * @param      Uinv  The inverted upper triangle matrix
 */
 void lcg_invert_upper_triangle(const Eigen::MatrixXd &U, Eigen::MatrixXd &Uinv);
 /**
 * @brief      Calculate the inverse of a lower triangular matrix (Full rank only).
 *
 * @param      L     The operating lower triangle matrix
 * @param      Linv  The inverted lower triangle matrix
 */
 void clcg_invert_lower_triangle(const Eigen::MatrixXcd &L, Eigen::MatrixXcd &Linv);
 /**
 * @brief      Calculate the inverse of an upper triangular matrix (Full rank only).
 *
 * @param      U     The operating upper triangle matrix
 * @param      Uinv  The inverted upper triangle matrix
 */
 void clcg_invert_upper_triangle(const Eigen::MatrixXcd &U, Eigen::MatrixXcd &Uinv);
 /**
 * @brief      Calculate the incomplete Cholesky decomposition and return the lower triangular matrix
 *
 * @param[in]  A     The input sparse matrix. Must be full rank and symmetric (aka. A = A^T)
 * @param      L     The output lower triangular matrix
 * @param      fill  The fill-in number of the output sparse matrix. No fill-in reduction will be processed if this variable is set to zero.
 */
 void lcg_incomplete_Cholesky(const Eigen::SparseMatrix<double, Eigen::RowMajor> &A, Eigen::SparseMatrix<double, Eigen::RowMajor> &L, size_t fill = 0);
 /**
 * @brief      Calculate the incomplete Cholesky decomposition and return the lower triangular matrix
 *
 * @param[in]  A     The input sparse matrix. Must be full rank and symmetric (aka. A = A^T)
 * @param      L     The output lower triangular matrix
 * @param      fill  The fill-in number of the output sparse matrix. No fill-in reduction will be processed if this variable is set to zero.
 */
 void clcg_incomplete_Cholesky(const Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &A, Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &L, size_t fill = 0);
 /**
 * @brief        Calculate the incomplete LU factorizations
 * 
 * @param A      The input sparse matrix. Must be full rank.
 * @param L      The output lower triangular matrix.
 * @param U      The output upper triangular matrix.
 * @param fill   The fill-in number of the output sparse matrix. No fill-in reduction will be processed if this variable is set to zero.
 */
 void lcg_incomplete_LU(const Eigen::SparseMatrix<double, Eigen::RowMajor> &A, Eigen::SparseMatrix<double, Eigen::RowMajor> &L, Eigen::SparseMatrix<double, Eigen::RowMajor> &U, size_t fill = 0);
 /**
 * @brief        Calculate the incomplete LU factorizations
 * 
 * @param A      The input sparse matrix. Must be full rank.
 * @param L      The output lower triangular matrix.
 * @param U      The output upper triangular matrix.
 * @param fill   The fill-in number of the output sparse matrix. No fill-in reduction will be processed if this variable is set to zero.
 */
 void clcg_incomplete_LU(const Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &A, Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &L, 
    Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &U, size_t fill = 0);
 /**
 * @brief    Solve the linear system Lx = B, in which L is a lower triangle matrix.
 * 
 * @param L  The input lower triangle matrix
 * @param B  The object vector
 * @param X  The solution vector
 */
 void lcg_solve_lower_triangle(const Eigen::SparseMatrix<double, Eigen::RowMajor> &L, const Eigen::VectorXd &B, Eigen::VectorXd &X);
 /**
 * @brief    Solve the linear system Ux = B, in which U is an upper triangular matrix.
 * 
 * @param U  The input upper triangle matrix
 * @param B  The object vector
 * @param X  The solution vector
 */
 void lcg_solve_upper_triangle(const Eigen::SparseMatrix<double, Eigen::RowMajor> &U, const Eigen::VectorXd &B, Eigen::VectorXd &X);
 /**
 * @brief    Solve the linear system Lx = B, in which L is a lower triangle matrix.
 * 
 * @param L  The input lower triangle matrix
 * @param B  The object vector
 * @param X  The solution vector
 */
 void clcg_solve_lower_triangle(const Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &L, const Eigen::VectorXcd &B, Eigen::VectorXcd &X);
 /**
 * @brief    Solve the linear system Ux = B, in which U is an upper triangular matrix.
 * 
 * @param U  The input upper triangle matrix
 * @param B  The object vector
 * @param X  The solution vector
 */
 void clcg_solve_upper_triangle(const Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &U, const Eigen::VectorXcd &B, Eigen::VectorXcd &X);
 #endif // _PRECONDITIONER_EIGEN_H
--- a/src/lib/solver.cpp
+++ b/src/lib/solver.cpp
@@ -0,0 +1,311 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "solver.h"
 #include "ctime"
 #include "iostream"
 #include "config.h"
 #ifdef LibLCG_OPENMP
 #include "omp.h"
 #endif
 // Start from the parameters returned by lcg_default_parameters(), report on every iteration and keep reporting enabled.
 LCG_Solver::LCG_Solver()
 	: param_(lcg_default_parameters()), inter_(1), silent_(false)
 {
 }
 // Print the iteration count and the convergence value on the current console line.
 // A line is printed on every inter_-th iteration, or otherwise once converge has
 // dropped to param->epsilon or below. Always returns 0.
 int LCG_Solver::Progress(const lcg_float* m, const lcg_float converge, 
 	const lcg_para *param, const int n_size, const int k)
 {
 	const bool on_interval = (inter_ > 0 && k%inter_ == 0);
 	// param is only dereferenced when the interval test fails, as in a two-step check
 	if (on_interval || converge <= param->epsilon)
 	{
 		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	}
 	return 0;
 }
 // Raise the silent flag: the Minimize* methods then call the solver without a
 // progress callback and skip the timing summary.
 void LCG_Solver::silent()
 {
 	silent_ = true;
 }
 // Store the number of iterations between two progress reports.
 // Progress() skips its interval-based report when the value is 0.
 void LCG_Solver::set_report_interval(unsigned int inter)
 {
 	inter_ = inter;
 }
 // Replace the stored solver parameters with a copy of in_param.
 void LCG_Solver::set_lcg_parameter(const lcg_para &in_param)
 {
 	param_ = in_param;
 }
 // Run lcg_solver() on the system defined by _AxProduct, starting from and writing to m.
 // In silent mode no progress callback is passed and only a negative return code is reported.
 // Otherwise the call is timed, a one-line summary is printed when er_throw is false, and the
 // return code is handed to lcg_error_str() when verbose is set or the code is negative.
 void LCG_Solver::Minimize(lcg_float *m, const lcg_float *b, int x_size, 
 	lcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	int ret;
 	if (silent_)
 	{
 		ret = lcg_solver(_AxProduct, nullptr, m, b, x_size, &param_, this, solver_id);
 		if (ret < 0) lcg_error_str(ret, true);
 		return;
 	}
 	// Solve with lcg. Note that default arguments cannot be omitted when the solver is called through function pointers
 	lcg_float costime;
 #ifdef LibLCG_OPENMP
 	const double start = omp_get_wtime();
 	ret = lcg_solver(_AxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
 	const double end = omp_get_wtime();
 	costime = 1000*(end-start);
 #else
 	const clock_t start = clock();
 	ret = lcg_solver(_AxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
 	const clock_t end = clock();
 	costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
 #endif
 	if (!er_throw)
 	{
 		// Pick the summary prefix that belongs to the selected solver
 		const char *banner;
 		switch (solver_id)
 		{
 			case LCG_CG:        banner = "Solver: CG. Time cost: "; break;
 			case LCG_CGS:       banner = "Solver: CGS. Time cost: "; break;
 			case LCG_BICGSTAB:  banner = "Solver: BICGSTAB. Times cost: "; break;
 			case LCG_BICGSTAB2: banner = "Solver: BICGSTAB2. Time cost: "; break;
 			default:            banner = "Solver: Unknown. Time cost: "; break;
 		}
 		std::clog << std::endl;
 		std::clog << banner << costime << " ms" << std::endl;
 	}
 	if (verbose || ret < 0) lcg_error_str(ret, er_throw);
 }
 // Run lcg_solver_preconditioned() with _AxProduct and _MxProduct, starting from and writing to m.
 // In silent mode no progress callback is passed and only a negative return code is reported.
 // Otherwise the call is timed, a one-line summary is printed when er_throw is false, and the
 // return code is handed to lcg_error_str() when verbose is set or the code is negative.
 void LCG_Solver::MinimizePreconditioned(lcg_float *m, const lcg_float *b, int x_size, 
 	lcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	int ret;
 	if (silent_)
 	{
 		ret = lcg_solver_preconditioned(_AxProduct, _MxProduct, nullptr, m, b, x_size, &param_, this, solver_id);
 		if (ret < 0) lcg_error_str(ret, true);
 		return;
 	}
 	// Solve with lcg. Note that default arguments cannot be omitted when the solver is called through function pointers
 	lcg_float costime;
 #ifdef LibLCG_OPENMP
 	const double start = omp_get_wtime();
 	ret = lcg_solver_preconditioned(_AxProduct, _MxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
 	const double end = omp_get_wtime();
 	costime = 1000*(end-start);
 #else
 	const clock_t start = clock();
 	ret = lcg_solver_preconditioned(_AxProduct, _MxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
 	const clock_t end = clock();
 	costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
 #endif
 	if (!er_throw)
 	{
 		// Only one solver has a dedicated label here; everything else is reported as unknown
 		const char *banner = (solver_id == LCG_PCG)
 			? "Solver: PCG. Time cost: "
 			: "Solver: Unknown. Time cost: ";
 		std::clog << std::endl;
 		std::clog << banner << costime << " ms" << std::endl;
 	}
 	if (verbose || ret < 0) lcg_error_str(ret, er_throw);
 }
 // Run lcg_solver_constrained() with _AxProduct and the low/hig arrays, starting from and writing to m.
 // In silent mode no progress callback is passed and only a negative return code is reported.
 // Otherwise the call is timed, a one-line summary is printed when er_throw is false, and the
 // return code is handed to lcg_error_str() when verbose is set or the code is negative.
 void LCG_Solver::MinimizeConstrained(lcg_float *m, const lcg_float *b, const lcg_float* low, 
 	const lcg_float *hig, int x_size, lcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	int ret;
 	if (silent_)
 	{
 		ret = lcg_solver_constrained(_AxProduct, nullptr, m, b, low, hig, x_size, &param_, this, solver_id);
 		if (ret < 0) lcg_error_str(ret, true);
 		return;
 	}
 	// Solve with lcg. Note that default arguments cannot be omitted when the solver is called through function pointers
 	lcg_float costime;
 #ifdef LibLCG_OPENMP
 	const double start = omp_get_wtime();
 	ret = lcg_solver_constrained(_AxProduct, _Progress, m, b, low, hig, x_size, &param_, this, solver_id);
 	const double end = omp_get_wtime();
 	costime = 1000*(end-start);
 #else
 	const clock_t start = clock();
 	ret = lcg_solver_constrained(_AxProduct, _Progress, m, b, low, hig, x_size, &param_, this, solver_id);
 	const clock_t end = clock();
 	costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
 #endif
 	if (!er_throw)
 	{
 		// Pick the summary prefix that belongs to the selected solver
 		const char *banner;
 		switch (solver_id)
 		{
 			case LCG_PG:  banner = "Solver: PG-CG. Time cost: "; break;
 			case LCG_SPG: banner = "Solver: SPG-CG. Time cost: "; break;
 			default:      banner = "Solver: Unknown. Time cost: "; break;
 		}
 		std::clog << std::endl;
 		std::clog << banner << costime << " ms" << std::endl;
 	}
 	if (verbose || ret < 0) lcg_error_str(ret, er_throw);
 }
 // Keep reporting enabled, report on every iteration and start from the
 // parameters returned by clcg_default_parameters().
 CLCG_Solver::CLCG_Solver()
 {
 	silent_ = false;
 	inter_ = 1;
 	param_ = clcg_default_parameters();
 }
 // Print the iteration count and the convergence value on the current console line.
 // A line is printed on every inter_-th iteration, or otherwise once converge has
 // dropped to param->epsilon or below. Always returns 0.
 int CLCG_Solver::Progress(const lcg_complex* m, const lcg_float converge, 
 	const clcg_para* param, const int n_size, const int k)
 {
 	const bool on_interval = (inter_ > 0 && k%inter_ == 0);
 	// param is only dereferenced when the interval test fails, as in a two-step check
 	if (on_interval || converge <= param->epsilon)
 	{
 		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	}
 	return 0;
 }
 // Raise the silent flag: Minimize() then calls the solver without a
 // progress callback and skips the timing summary.
 void CLCG_Solver::silent()
 {
 	silent_ = true;
 }
 // Store the number of iterations between two progress reports.
 // Progress() skips its interval-based report when the value is 0.
 void CLCG_Solver::set_report_interval(unsigned int inter)
 {
 	inter_ = inter;
 }
 // Replace the stored solver parameters with a copy of in_param.
 void CLCG_Solver::set_clcg_parameter(const clcg_para &in_param)
 {
 	param_ = in_param;
 }
 // Run clcg_solver() on the system defined by _AxProduct, starting from and writing to m.
 // In silent mode no progress callback is passed and only a negative return code is reported.
 // Otherwise the call is timed, a one-line summary is printed when er_throw is false, and the
 // return code is handed to clcg_error_str() when verbose is set or the code is negative.
 void CLCG_Solver::Minimize(lcg_complex *m, const lcg_complex *b, int x_size, 
 	clcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	int ret;
 	if (silent_)
 	{
 		ret = clcg_solver(_AxProduct, nullptr, m, b, x_size, &param_, this, solver_id);
 		if (ret < 0) clcg_error_str(ret, true);
 		return;
 	}
 	// Solve with clcg. Note that default arguments cannot be omitted when the solver is called through function pointers
 	lcg_float costime;
 #ifdef LibLCG_OPENMP
 	const double start = omp_get_wtime();
 	ret = clcg_solver(_AxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
 	const double end = omp_get_wtime();
 	costime = 1000*(end-start);
 #else
 	const clock_t start = clock();
 	ret = clcg_solver(_AxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
 	const clock_t end = clock();
 	costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
 #endif
 	if (!er_throw)
 	{
 		// Pick the summary prefix that belongs to the selected solver
 		const char *banner;
 		switch (solver_id)
 		{
 			case CLCG_BICG:     banner = "Solver: Bi-CG. Times cost: "; break;
 			case CLCG_BICG_SYM: banner = "Solver: Bi-CG (symmetrically accelerated). Times cost: "; break;
 			case CLCG_CGS:      banner = "Solver: CGS. Times cost: "; break;
 			case CLCG_TFQMR:    banner = "Solver: TFQMR. Times cost: "; break;
 			default:            banner = "Solver: Unknown. Times cost: "; break;
 		}
 		std::clog << std::endl;
 		std::clog << banner << costime << " ms" << std::endl;
 	}
 	if (verbose || ret < 0) clcg_error_str(ret, er_throw);
 }
--- a/src/lib/solver.h
+++ b/src/lib/solver.h
@@ -0,0 +1,285 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _SOLVER_H
 #define _SOLVER_H
 #include "lcg.h"
 #include "clcg.h"
 /**
 * @brief      Base class wrapping the real-valued linear conjugate gradient solvers.
 *
 * A derived class implements AxProduct() and MxProduct(); the static
 * underscore-prefixed members are plain-function trampolines that cast the
 * opaque instance pointer back to LCG_Solver and forward to the virtuals.
 */
 class LCG_Solver
 {
 protected:
 	lcg_para param_;      // parameters of the algorithms, see set_lcg_parameter()
 	unsigned int inter_;  // reporting interval, see set_report_interval()
 	bool silent_;         // raised by silent()
 public:
 	LCG_Solver();
 	virtual ~LCG_Solver(){}
 	/**
 	 * @brief       Trampoline forwarding to the virtual AxProduct()
 	 * 
 	 * @param      instance  Opaque user data; must point to an LCG_Solver object
 	 * @param[in]  a         Pointer of the multiplier
 	 * @param[out] b         Pointer of the product
 	 * @param      num       Size of the array
 	 */
 	static void _AxProduct(void* instance, const lcg_float* a, lcg_float* b, const int num)
 	{
 		LCG_Solver* self = static_cast<LCG_Solver*>(instance);
 		self->AxProduct(a, b, num);
 	}
 	/**
 	 * @brief       Product of A*x, to be implemented by the derived class
 	 * 
 	 * @param[in]  a    Pointer of the multiplier
 	 * @param[out] b    Pointer of the product
 	 * @param      num  Size of the array
 	 */
 	virtual void AxProduct(const lcg_float* a, lcg_float* b, const int num) = 0;
 	/**
 	 * @brief       Trampoline forwarding to the virtual MxProduct()
 	 * 
 	 * @param      instance  Opaque user data; must point to an LCG_Solver object
 	 * @param[in]  a         Pointer of the multiplier
 	 * @param[out] b         Pointer of the product
 	 * @param      num       Size of the array
 	 */
 	static void _MxProduct(void* instance, const lcg_float* a, lcg_float* b, const int num)
 	{
 		LCG_Solver* self = static_cast<LCG_Solver*>(instance);
 		self->MxProduct(a, b, num);
 	}
 	/**
 	 * @brief       Product of M^-1*x, to be implemented by the derived class
 	 * 
 	 * @param[in]  a    Pointer of the multiplier
 	 * @param[out] b    Pointer of the product
 	 * @param      num  Size of the array
 	 */
 	virtual void MxProduct(const lcg_float* a, lcg_float* b, const int num) = 0;
 	/**
 	 * @brief       Trampoline forwarding to the virtual Progress()
 	 * 
 	 * @param instance    Opaque user data; must point to an LCG_Solver object
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param k           Current iteration times
 	 * @return int        Whatever Progress() returns
 	 */
 	static int _Progress(void* instance, const lcg_float* m, const lcg_float converge, 
 		const lcg_para *param, const int n_size, const int k)
 	{
 		LCG_Solver* self = static_cast<LCG_Solver*>(instance);
 		return self->Progress(m, converge, param, n_size, k);
 	}
 	/**
 	 * @brief       Process monitoring hook; may be overridden by the derived class
 	 * 
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param k           Current iteration times
 	 * @return int        Status of the process
 	 */
 	virtual int Progress(const lcg_float* m, const lcg_float converge, 
 		const lcg_para *param, const int n_size, const int k);
 	/**
 	 * @brief      Do not report any processes
 	 */
 	void silent();
 	/**
 	 * @brief      Set the interval at which the process monitoring function reports
 	 * 
 	 * @param inter      The interval
 	 */
 	void set_report_interval(unsigned int inter);
 	/**
 	 * @brief      Set the parameters of the algorithms
 	 * 
 	 * @param in_param   The input parameters
 	 */
 	void set_lcg_parameter(const lcg_para &in_param);
 	/**
 	 * @brief      Run the minimizing process
 	 * 
 	 * @param m          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param x_size     Size of the solution vector
 	 * @param solver_id  Solver type (defaults to LCG_CG)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void Minimize(lcg_float *m, const lcg_float *b, int x_size, 
 		lcg_solver_enum solver_id = LCG_CG, bool verbose = true, bool er_throw = false);
 	/**
 	 * @brief      Run the preconditioned minimizing process
 	 * 
 	 * @param m          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param x_size     Size of the solution vector
 	 * @param solver_id  Solver type (defaults to LCG_PCG)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void MinimizePreconditioned(lcg_float *m, const lcg_float *b, int x_size, 
 		lcg_solver_enum solver_id = LCG_PCG, bool verbose = true, bool er_throw = false);
 	/**
 	 * @brief      Run the constrained minimizing process
 	 * 
 	 * @param m          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param low        Lower bound of the solution vector
 	 * @param hig        Higher bound of the solution vector
 	 * @param x_size     Size of the solution vector
 	 * @param solver_id  Solver type (defaults to LCG_PG)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void MinimizeConstrained(lcg_float *m, const lcg_float *b, const lcg_float* low, 
 		const lcg_float *hig, int x_size, lcg_solver_enum solver_id = LCG_PG, 
 		bool verbose = true, bool er_throw = false);
 };
 /**
 * @brief      Base class wrapping the complex-valued linear conjugate gradient solvers.
 *
 * A derived class implements AxProduct(); the static underscore-prefixed
 * members are plain-function trampolines that cast the opaque instance
 * pointer back to CLCG_Solver and forward to the virtuals.
 */
 class CLCG_Solver
 {
 protected:
 	clcg_para param_;     // parameters of the algorithms, see set_clcg_parameter()
 	unsigned int inter_;  // reporting interval, see set_report_interval()
 	bool silent_;         // raised by silent()
 public:
 	CLCG_Solver();
 	virtual ~CLCG_Solver(){}
 	/**
 	 * @brief       Trampoline forwarding to the virtual AxProduct()
 	 * 
 	 * @param      instance   Opaque user data; must point to a CLCG_Solver object
 	 * @param[in]  x          Pointer of the multiplier
 	 * @param[out] prod_Ax    Pointer of the product
 	 * @param      x_size     Size of the array
 	 * @param      layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
 	 * @param      conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
 	 */
 	static void _AxProduct(void *instance, const lcg_complex *x, lcg_complex *prod_Ax, 
 		const int x_size, lcg_matrix_e layout, clcg_complex_e conjugate)
 	{
 		CLCG_Solver* self = static_cast<CLCG_Solver*>(instance);
 		self->AxProduct(x, prod_Ax, x_size, layout, conjugate);
 	}
 	/**
 	 * @brief       Product of A*x, to be implemented by the derived class
 	 * 
 	 * @param[in]  x          Pointer of the multiplier
 	 * @param[out] prod_Ax    Pointer of the product
 	 * @param      x_size     Size of the array
 	 * @param      layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
 	 * @param      conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
 	 */
 	virtual void AxProduct(const lcg_complex *x, lcg_complex *prod_Ax, 
 		const int x_size, lcg_matrix_e layout, clcg_complex_e conjugate) = 0;
 	/**
 	 * @brief       Trampoline forwarding to the virtual Progress()
 	 * 
 	 * @param instance    Opaque user data; must point to a CLCG_Solver object
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param k           Current iteration times
 	 * @return int        Whatever Progress() returns
 	 */
 	static int _Progress(void* instance, const lcg_complex* m, const lcg_float converge, 
 		const clcg_para* param, const int n_size, const int k)
 	{
 		CLCG_Solver* self = static_cast<CLCG_Solver*>(instance);
 		return self->Progress(m, converge, param, n_size, k);
 	}
 	/**
 	 * @brief       Process monitoring hook; may be overridden by the derived class
 	 * 
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param k           Current iteration times
 	 * @return int        Status of the process
 	 */
 	virtual int Progress(const lcg_complex* m, const lcg_float converge, 
 		const clcg_para* param, const int n_size, const int k);
 	/**
 	 * @brief      Do not report any processes
 	 */
 	void silent();
 	/**
 	 * @brief      Set the interval at which the process monitoring function reports
 	 * 
 	 * @param inter      The interval
 	 */
 	void set_report_interval(unsigned int inter);
 	/**
 	 * @brief      Set the parameters of the algorithms
 	 * 
 	 * @param in_param   The input parameters
 	 */
 	void set_clcg_parameter(const clcg_para &in_param);
 	/**
 	 * @brief      Run the minimizing process
 	 * 
 	 * @param m          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param x_size     Size of the solution vector
 	 * @param solver_id  Solver type (defaults to CLCG_CGS)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void Minimize(lcg_complex *m, const lcg_complex *b, int x_size, 
 		clcg_solver_enum solver_id = CLCG_CGS, bool verbose = true, 
 		bool er_throw = false);
 };
 #endif // _SOLVER_H
--- a/src/lib/solver_cuda.cu
+++ b/src/lib/solver_cuda.cu
@@ -0,0 +1,414 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "solver_cuda.h"
 #include "cmath"
 #include "ctime"
 #include "iostream"
 // Start from the library's default parameters, report on every iteration
 // (interval 1) and leave progress reporting switched on.
 LCG_CUDA_Solver::LCG_CUDA_Solver()
 	: param_(lcg_default_parameters()), inter_(1), silent_(false)
 {
 }
 // Default progress monitor: rewrites one status line on std::clog.
 int LCG_CUDA_Solver::Progress(const lcg_float* m, const lcg_float converge, 
 	const lcg_para* param, const int n_size, const int nz_size, const int k)
 {
 	// Print on every inter_-th iteration, or otherwise once the convergence
 	// value has dropped to param->epsilon or below.
 	const bool on_interval = (inter_ > 0) && (k%inter_ == 0);
 	if (on_interval || converge <= param->epsilon)
 	{
 		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	}
 	// m, n_size and nz_size are not used here; the status is always 0.
 	return 0;
 }
 void LCG_CUDA_Solver::silent()
 {
 	silent_ = true;
 	return;
 }
 void LCG_CUDA_Solver::set_report_interval(unsigned int inter)
 {
 	inter_ = inter;
 	return;
 }
 void LCG_CUDA_Solver::set_lcg_parameter(const lcg_para &in_param)
 {
 	param_ = in_param;
 	return;
 }
 // Run lcg_solver_cuda() with this object's A*x product and stored parameters.
 // Unless silent() was called, the run is timed with clock() and, when
 // er_throw is false, a one-line summary is written to std::clog.
 void LCG_CUDA_Solver::Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, lcg_float *b, 
 	const int n_size, const int nz_size, lcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	if (silent_)
 	{
 		// Silent mode: no progress callback and no timing report. A negative
 		// return code is passed on with the throw flag forced to true.
 		const int status = lcg_solver_cuda(_AxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 		if (status < 0) lcg_error_str(status, true);
 		return;
 	}
 	// Solve the system. Note that default arguments cannot be omitted when
 	// the solver is invoked through function pointers.
 	const clock_t tic = clock();
 	const int status = lcg_solver_cuda(_AxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 	const clock_t toc = clock();
 	const lcg_float costime = 1000*(toc-tic)/(double)CLOCKS_PER_SEC;
 	if (!er_throw)
 	{
 		// Pick the leading part of the summary line for the chosen solver.
 		const char *banner = "Solver: Unknown. Time cost: ";
 		if (solver_id == LCG_CG) banner = "Solver: CG. Time cost: ";
 		else if (solver_id == LCG_CGS) banner = "Solver: CGS. Time cost: ";
 		// The first endl terminates the '\r'-rewritten progress line.
 		std::clog << std::endl;
 		std::clog << banner << costime << " ms" << std::endl;
 	}
 	// Verbose runs always report the return code; otherwise only failures do.
 	if (verbose || status < 0) lcg_error_str(status, er_throw);
 }
 // Run lcg_solver_preconditioned_cuda() with this object's A*x and M^-1*x
 // products and stored parameters. Unless silent() was called, the run is
 // timed with clock() and, when er_throw is false, a one-line summary is
 // written to std::clog.
 void LCG_CUDA_Solver::MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, lcg_float *b, 
 	const int n_size, const int nz_size, lcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	if (silent_)
 	{
 		// Silent mode: no progress callback and no timing report. A negative
 		// return code is passed on with the throw flag forced to true.
 		const int status = lcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 		if (status < 0) lcg_error_str(status, true);
 		return;
 	}
 	// Solve the system. Note that default arguments cannot be omitted when
 	// the solver is invoked through function pointers.
 	const clock_t tic = clock();
 	const int status = lcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 	const clock_t toc = clock();
 	const lcg_float costime = 1000*(toc-tic)/(double)CLOCKS_PER_SEC;
 	if (!er_throw)
 	{
 		// Only LCG_PCG has a dedicated label; anything else prints "Unknown".
 		const char *banner = (solver_id == LCG_PCG)
 			? "Solver: PCG. Time cost: "
 			: "Solver: Unknown. Time cost: ";
 		// The first endl terminates the '\r'-rewritten progress line.
 		std::clog << std::endl;
 		std::clog << banner << costime << " ms" << std::endl;
 	}
 	// Verbose runs always report the return code; otherwise only failures do.
 	if (verbose || status < 0) lcg_error_str(status, er_throw);
 }
 // Run lcg_solver_constrained_cuda() with this object's A*x product, the
 // low/hig bounds and the stored parameters. Unless silent() was called, the
 // run is timed with clock() and, when er_throw is false, a one-line summary
 // is written to std::clog.
 void LCG_CUDA_Solver::MinimizeConstrained(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, const lcg_float *b, 
 	const lcg_float* low, const lcg_float *hig, const int n_size, const int nz_size, lcg_solver_enum solver_id, 
 	bool verbose, bool er_throw)
 {
 	if (silent_)
 	{
 		// Silent mode: no progress callback and no timing report. A negative
 		// return code is passed on with the throw flag forced to true.
 		const int status = lcg_solver_constrained_cuda(_AxProduct, nullptr, x, b, low, hig, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 		if (status < 0) lcg_error_str(status, true);
 		return;
 	}
 	// Solve the system. Note that default arguments cannot be omitted when
 	// the solver is invoked through function pointers.
 	const clock_t tic = clock();
 	const int status = lcg_solver_constrained_cuda(_AxProduct, _Progress, x, b, low, hig, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 	const clock_t toc = clock();
 	const lcg_float costime = 1000*(toc-tic)/(double)CLOCKS_PER_SEC;
 	if (!er_throw)
 	{
 		// Only LCG_PG has a dedicated label; anything else prints "Unknown".
 		const char *banner = (solver_id == LCG_PG)
 			? "Solver: PG. Time cost: "
 			: "Solver: Unknown. Time cost: ";
 		// The first endl terminates the '\r'-rewritten progress line.
 		std::clog << std::endl;
 		std::clog << banner << costime << " ms" << std::endl;
 	}
 	// Verbose runs always report the return code; otherwise only failures do.
 	if (verbose || status < 0) lcg_error_str(status, er_throw);
 }
 // Start from the library's default complex-solver parameters, report on
 // every iteration (interval 1) and leave progress reporting switched on.
 CLCG_CUDAF_Solver::CLCG_CUDAF_Solver()
 	: param_(clcg_default_parameters()), inter_(1), silent_(false)
 {
 }
 // Default progress monitor: rewrites one status line on std::clog.
 int CLCG_CUDAF_Solver::Progress(const cuComplex* m, const float converge, 
 	const clcg_para* param, const int n_size, const int nz_size, const int k)
 {
 	// Print on every inter_-th iteration, or otherwise once the convergence
 	// value has dropped to param->epsilon or below.
 	const bool on_interval = (inter_ > 0) && (k%inter_ == 0);
 	if (on_interval || converge <= param->epsilon)
 	{
 		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	}
 	// m, n_size and nz_size are not used here; the status is always 0.
 	return 0;
 }
 void CLCG_CUDAF_Solver::silent()
 {
 	silent_ = true;
 	return;
 }
 void CLCG_CUDAF_Solver::set_report_interval(unsigned int inter)
 {
 	inter_ = inter;
 	return;
 }
 void CLCG_CUDAF_Solver::set_clcg_parameter(const clcg_para &in_param)
 {
 	param_ = in_param;
 	return;
 }
 /**
  * @brief      Run clcg_solver_cuda() on cuComplex data and report the outcome
  *
  * Unless silent() was called, the run is timed with clock() and, when
  * er_throw is false, a one-line summary is written to std::clog.
  *
  * @param cub_handle  Handler of the CuBLAS library
  * @param cus_handle  Handler of the CuSparse library
  * @param x           Pointer of the solution vector
  * @param b           Pointer of the targeting vector
  * @param n_size      Size of the solution vector
  * @param nz_size     Non-zero size of the sparse kernel matrix
  * @param solver_id   Solver type
  * @param verbose     Report the return code even when the run succeeded
  * @param er_throw    Forwarded to clcg_error_str() as its throw flag
  */
 void CLCG_CUDAF_Solver::Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuComplex *x, cuComplex *b, 
 	const int n_size, const int nz_size, clcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	if (silent_)
 	{
 		// Silent mode: no progress callback and no timing report. A negative
 		// return code is passed on with the throw flag forced to true.
 		int ret = clcg_solver_cuda(_AxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 		// The code comes from the complex solver, so decode it with the
 		// complex decoder, as CLCG_Solver::Minimize does.
 		if (ret < 0) clcg_error_str(ret, true);
 		return;
 	}
 	// Solve the system. Note that default arguments cannot be omitted when
 	// the solver is invoked through function pointers.
 	clock_t start = clock();
 	int ret = clcg_solver_cuda(_AxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 	clock_t end = clock();
 	float costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
 	if (!er_throw)
 	{
 		// The first endl terminates the '\r'-rewritten progress line.
 		std::clog << std::endl;
 		switch (solver_id)
 		{
 			case CLCG_BICG:
 				std::clog << "Solver: BI-CG. Time cost: " << costime << " ms" << std::endl;
 				break;
 			case CLCG_BICG_SYM:
 				std::clog << "Solver: BI-CG (symmetrically accelerated). Time cost: " << costime << " ms" << std::endl;
 				break;
 			default:
 				std::clog << "Solver: Unknown. Time cost: " << costime << " ms" << std::endl;
 				break;
 		}
 	}
 	// Verbose runs always report the return code; otherwise only failures do.
 	if (verbose || ret < 0) clcg_error_str(ret, er_throw);
 	return;
 }
 /**
  * @brief      Run clcg_solver_preconditioned_cuda() on cuComplex data and report the outcome
  *
  * Unless silent() was called, the run is timed with clock() and, when
  * er_throw is false, a one-line summary is written to std::clog.
  *
  * @param cub_handle  Handler of the CuBLAS library
  * @param cus_handle  Handler of the CuSparse library
  * @param x           Pointer of the solution vector
  * @param b           Pointer of the targeting vector
  * @param n_size      Size of the solution vector
  * @param nz_size     Non-zero size of the sparse kernel matrix
  * @param solver_id   Solver type
  * @param verbose     Report the return code even when the run succeeded
  * @param er_throw    Forwarded to clcg_error_str() as its throw flag
  */
 void CLCG_CUDAF_Solver::MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuComplex *x, cuComplex *b, 
 	const int n_size, const int nz_size, clcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	if (silent_)
 	{
 		// Silent mode: no progress callback and no timing report. A negative
 		// return code is passed on with the throw flag forced to true.
 		int ret = clcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 		// The code comes from the complex solver, so decode it with the
 		// complex decoder, as CLCG_Solver::Minimize does.
 		if (ret < 0) clcg_error_str(ret, true);
 		return;
 	}
 	// Solve the system. Note that default arguments cannot be omitted when
 	// the solver is invoked through function pointers.
 	clock_t start = clock();
 	int ret = clcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 	clock_t end = clock();
 	float costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
 	if (!er_throw)
 	{
 		// The first endl terminates the '\r'-rewritten progress line.
 		std::clog << std::endl;
 		switch (solver_id)
 		{
 			case CLCG_PCG:
 				std::clog << "Solver: PCG. Time cost: " << costime << " ms" << std::endl;
 				break;
 			default:
 				std::clog << "Solver: Unknown. Time cost: " << costime << " ms" << std::endl;
 				break;
 		}
 	}
 	// Verbose runs always report the return code; otherwise only failures do.
 	if (verbose || ret < 0) clcg_error_str(ret, er_throw);
 	return;
 }
 // Start from the library's default complex-solver parameters, report on
 // every iteration (interval 1) and leave progress reporting switched on.
 CLCG_CUDA_Solver::CLCG_CUDA_Solver()
 	: param_(clcg_default_parameters()), inter_(1), silent_(false)
 {
 }
 // Default progress monitor: rewrites one status line on std::clog.
 int CLCG_CUDA_Solver::Progress(const cuDoubleComplex* m, const lcg_float converge, 
 	const clcg_para* param, const int n_size, const int nz_size, const int k)
 {
 	// Print on every inter_-th iteration, or otherwise once the convergence
 	// value has dropped to param->epsilon or below.
 	const bool on_interval = (inter_ > 0) && (k%inter_ == 0);
 	if (on_interval || converge <= param->epsilon)
 	{
 		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	}
 	// m, n_size and nz_size are not used here; the status is always 0.
 	return 0;
 }
 void CLCG_CUDA_Solver::silent()
 {
 	silent_ = true;
 	return;
 }
 void CLCG_CUDA_Solver::set_report_interval(unsigned int inter)
 {
 	inter_ = inter;
 	return;
 }
 void CLCG_CUDA_Solver::set_clcg_parameter(const clcg_para &in_param)
 {
 	param_ = in_param;
 	return;
 }
 /**
  * @brief      Run clcg_solver_cuda() on cuDoubleComplex data and report the outcome
  *
  * Unless silent() was called, the run is timed with clock() and, when
  * er_throw is false, a one-line summary is written to std::clog.
  *
  * @param cub_handle  Handler of the CuBLAS library
  * @param cus_handle  Handler of the CuSparse library
  * @param x           Pointer of the solution vector
  * @param b           Pointer of the targeting vector
  * @param n_size      Size of the solution vector
  * @param nz_size     Non-zero size of the sparse kernel matrix
  * @param solver_id   Solver type
  * @param verbose     Report the return code even when the run succeeded
  * @param er_throw    Forwarded to clcg_error_str() as its throw flag
  */
 void CLCG_CUDA_Solver::Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuDoubleComplex *x, cuDoubleComplex *b, 
 	const int n_size, const int nz_size, clcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	if (silent_)
 	{
 		// Silent mode: no progress callback and no timing report. A negative
 		// return code is passed on with the throw flag forced to true.
 		int ret = clcg_solver_cuda(_AxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 		// The code comes from the complex solver, so decode it with the
 		// complex decoder, as CLCG_Solver::Minimize does.
 		if (ret < 0) clcg_error_str(ret, true);
 		return;
 	}
 	// Solve the system. Note that default arguments cannot be omitted when
 	// the solver is invoked through function pointers.
 	clock_t start = clock();
 	int ret = clcg_solver_cuda(_AxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 	clock_t end = clock();
 	lcg_float costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
 	if (!er_throw)
 	{
 		// The first endl terminates the '\r'-rewritten progress line.
 		std::clog << std::endl;
 		switch (solver_id)
 		{
 			case CLCG_BICG:
 				std::clog << "Solver: BI-CG. Time cost: " << costime << " ms" << std::endl;
 				break;
 			case CLCG_BICG_SYM:
 				std::clog << "Solver: BI-CG (symmetrically accelerated). Time cost: " << costime << " ms" << std::endl;
 				break;
 			default:
 				std::clog << "Solver: Unknown. Time cost: " << costime << " ms" << std::endl;
 				break;
 		}
 	}
 	// Verbose runs always report the return code; otherwise only failures do.
 	if (verbose || ret < 0) clcg_error_str(ret, er_throw);
 	return;
 }
 /**
  * @brief      Run clcg_solver_preconditioned_cuda() on cuDoubleComplex data and report the outcome
  *
  * Unless silent() was called, the run is timed with clock() and, when
  * er_throw is false, a one-line summary is written to std::clog.
  *
  * @param cub_handle  Handler of the CuBLAS library
  * @param cus_handle  Handler of the CuSparse library
  * @param x           Pointer of the solution vector
  * @param b           Pointer of the targeting vector
  * @param n_size      Size of the solution vector
  * @param nz_size     Non-zero size of the sparse kernel matrix
  * @param solver_id   Solver type
  * @param verbose     Report the return code even when the run succeeded
  * @param er_throw    Forwarded to clcg_error_str() as its throw flag
  */
 void CLCG_CUDA_Solver::MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuDoubleComplex *x, cuDoubleComplex *b, 
 	const int n_size, const int nz_size, clcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	if (silent_)
 	{
 		// Silent mode: no progress callback and no timing report. A negative
 		// return code is passed on with the throw flag forced to true.
 		int ret = clcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 		// The code comes from the complex solver, so decode it with the
 		// complex decoder, as CLCG_Solver::Minimize does.
 		if (ret < 0) clcg_error_str(ret, true);
 		return;
 	}
 	// Solve the system. Note that default arguments cannot be omitted when
 	// the solver is invoked through function pointers.
 	clock_t start = clock();
 	int ret = clcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
 	clock_t end = clock();
 	lcg_float costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
 	if (!er_throw)
 	{
 		// The first endl terminates the '\r'-rewritten progress line.
 		std::clog << std::endl;
 		switch (solver_id)
 		{
 			case CLCG_PCG:
 				std::clog << "Solver: PCG. Time cost: " << costime << " ms" << std::endl;
 				break;
 			default:
 				std::clog << "Solver: Unknown. Time cost: " << costime << " ms" << std::endl;
 				break;
 		}
 	}
 	// Verbose runs always report the return code; otherwise only failures do.
 	if (verbose || ret < 0) clcg_error_str(ret, er_throw);
 	return;
 }
--- a/src/lib/solver_cuda.h
+++ b/src/lib/solver_cuda.h
@@ -0,0 +1,545 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _SOLVER_CUDA_H
 #define _SOLVER_CUDA_H
 #include "lcg_cuda.h"
 #include "clcg_cuda.h"
 #include "clcg_cudaf.h"
 #ifdef LibLCG_CUDA
 /**
 * @brief      Base class wrapping the real-valued CUDA linear conjugate gradient solvers.
 *
 * A derived class implements AxProduct() and MxProduct(); the static
 * underscore-prefixed members are plain-function trampolines that cast the
 * opaque instance pointer back to LCG_CUDA_Solver and forward to the virtuals.
 */
 class LCG_CUDA_Solver
 {
 protected:
 	lcg_para param_;      // parameters of the algorithms, see set_lcg_parameter()
 	unsigned int inter_;  // reporting interval, see set_report_interval()
 	bool silent_;         // raised by silent()
 public:
 	LCG_CUDA_Solver();
 	virtual ~LCG_CUDA_Solver(){}
 	/**
 	 * @brief       Trampoline forwarding to the virtual AxProduct()
 	 * 
 	 * @param      instance    Opaque user data; must point to an LCG_CUDA_Solver object
 	 * @param      cub_handle  Handler of the CuBLAS library
 	 * @param      cus_handle  Handler of the CuSparse library
 	 * @param[in]  x           CuSparse dense-vector descriptor of the multiplier
 	 * @param[out] prod_Ax     CuSparse dense-vector descriptor of the product
 	 * @param      n_size      Size of the solution
 	 * @param      nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
 	 */
 	static void _AxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
 	{
 		return reinterpret_cast<LCG_CUDA_Solver*>(instance)->AxProduct(cub_handle, cus_handle, x, prod_Ax, n_size, nz_size);
 	}
 	/**
 	 * @brief       Product of A*x, to be implemented by the derived class
 	 * 
 	 * @param      cub_handle  Handler of the CuBLAS library
 	 * @param      cus_handle  Handler of the CuSparse library
 	 * @param[in]  x           CuSparse dense-vector descriptor of the multiplier
 	 * @param[out] prod_Ax     CuSparse dense-vector descriptor of the product
 	 * @param      n_size      Size of the solution
 	 * @param      nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
 	 */
 	virtual void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size) = 0;
 	/**
 	 * @brief       Trampoline forwarding to the virtual MxProduct()
 	 * 
 	 * @param      instance    Opaque user data; must point to an LCG_CUDA_Solver object
 	 * @param      cub_handle  Handler of the CuBLAS library
 	 * @param      cus_handle  Handler of the CuSparse library
 	 * @param[in]  x           CuSparse dense-vector descriptor of the multiplier
 	 * @param[out] prod_Mx     CuSparse dense-vector descriptor of the product
 	 * @param      n_size      Size of the solution
 	 * @param      nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
 	 */
 	static void _MxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, const int n_size, const int nz_size)
 	{
 		// Must dispatch to MxProduct(): forwarding to AxProduct() here would
 		// apply A instead of the preconditioner M^-1.
 		return reinterpret_cast<LCG_CUDA_Solver*>(instance)->MxProduct(cub_handle, cus_handle, x, prod_Mx, n_size, nz_size);
 	}
 	/**
 	 * @brief       Product of M^-1*x, to be implemented by the derived class
 	 * 
 	 * @param      cub_handle  Handler of the CuBLAS library
 	 * @param      cus_handle  Handler of the CuSparse library
 	 * @param[in]  x           CuSparse dense-vector descriptor of the multiplier
 	 * @param[out] prod_Mx     CuSparse dense-vector descriptor of the product
 	 * @param      n_size      Size of the solution
 	 * @param      nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
 	 */
 	virtual void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, const int n_size, const int nz_size) = 0;
 	/**
 	 * @brief       Trampoline forwarding to the virtual Progress()
 	 * 
 	 * @param instance    Opaque user data; must point to an LCG_CUDA_Solver object
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
 	 * @param k           Current iteration times
 	 * @return int        Whatever Progress() returns
 	 */
 	static int _Progress(void* instance, const lcg_float* m, const lcg_float converge, 
 		const lcg_para* param, const int n_size, const int nz_size, const int k)
 	{
 		return reinterpret_cast<LCG_CUDA_Solver*>(instance)->Progress(m, converge, param, n_size, nz_size, k);
 	}
 	/**
 	 * @brief       Process monitoring hook; may be overridden by the derived class
 	 * 
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
 	 * @param k           Current iteration times
 	 * @return int        Status of the process
 	 */
 	virtual int Progress(const lcg_float* m, const lcg_float converge, 
 		const lcg_para* param, const int n_size, const int nz_size, const int k);
 	/**
 	 * @brief      Do not report any processes
 	 */
 	void silent();
 	/**
 	 * @brief      Set the interval at which the process monitoring function reports
 	 * 
 	 * @param inter      The interval
 	 */
 	void set_report_interval(unsigned int inter);
 	/**
 	 * @brief      Set the parameters of the algorithms
 	 * 
 	 * @param in_param   The input parameters
 	 */
 	void set_lcg_parameter(const lcg_para &in_param);
 	/**
 	 * @brief      Run the minimizing process
 	 * 
 	 * @param cub_handle  Handler of the CuBLAS library
 	 * @param cus_handle  Handler of the CuSparse library
 	 * @param x          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param n_size     Size of the solution vector
 	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
 	 * @param solver_id  Solver type (defaults to LCG_CG)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, lcg_float *b, 
 		const int n_size, const int nz_size, lcg_solver_enum solver_id = LCG_CG, bool verbose = true, bool er_throw = false);
 	/**
 	 * @brief      Run the preconditioned minimizing process
 	 * 
 	 * NOTE(review): the default solver_id here is LCG_CG; a preconditioned
 	 * entry point would be expected to default to LCG_PCG. The default is left
 	 * unchanged to keep the interface stable; confirm whether this is intended.
 	 * 
 	 * @param cub_handle  Handler of the CuBLAS library
 	 * @param cus_handle  Handler of the CuSparse library
 	 * @param x          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param n_size     Size of the solution vector
 	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
 	 * @param solver_id  Solver type
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, lcg_float *b, 
 		const int n_size, const int nz_size, lcg_solver_enum solver_id = LCG_CG, bool verbose = true, bool er_throw = false);
 	/**
 	 * @brief      Run the constrained minimizing process
 	 * 
 	 * @param cub_handle  Handler of the CuBLAS library
 	 * @param cus_handle  Handler of the CuSparse library
 	 * @param x          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param low        Lower bound of the solution vector
 	 * @param hig        Higher bound of the solution vector
 	 * @param n_size     Size of the solution vector
 	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
 	 * @param solver_id  Solver type (defaults to LCG_PG)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void MinimizeConstrained(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, const lcg_float *b, 
 		const lcg_float* low, const lcg_float *hig, const int n_size, const int nz_size, lcg_solver_enum solver_id = LCG_PG, 
 		bool verbose = true, bool er_throw = false);
 };
 /**
 * @brief      Complex linear conjugate gradient solver class working on cuComplex 
 *             solution vectors with a float convergence value.
 * 
 * The A*x and M^-1*x products are pure virtual, so a derived class must implement 
 * both of them. The static members _AxProduct, _MxProduct and _Progress take an 
 * opaque instance pointer, cast it back to CLCG_CUDAF_Solver and forward the call 
 * to the matching virtual member.
 */
 class CLCG_CUDAF_Solver
 {
 protected:
 	// Parameters of the algorithms
 	clcg_para param_;
 	// Interval of the process monitoring
 	unsigned int inter_;
 	// Silent flag
 	bool silent_;
 public:
 	CLCG_CUDAF_Solver();
 	virtual ~CLCG_CUDAF_Solver(){}
 	/**
 	 * @brief       Interface of the virtual function of the product of A*x. 
 	 *              Forwards the call to AxProduct() of the object behind instance
 	 * 
 	 * @param instance   User data sent to identify the function address. It is cast to CLCG_CUDAF_Solver*
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x[in]      Dense vector descriptor of the multiplier
 	 * @param prod_Ax[out]     Dense vector descriptor of the product
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param oper_t      Cusparse operator. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 */
 	static void _AxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		return reinterpret_cast<CLCG_CUDAF_Solver*>(instance)->AxProduct(cub_handle, cus_handle, x, prod_Ax, n_size, nz_size, oper_t);
 	}
 	/**
 	 * @brief       Virtual function of the product of A*x. Pure virtual: must be implemented by a derived class
 	 * 
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x[in]      Dense vector descriptor of the multiplier
 	 * @param prod_Ax[out]     Dense vector descriptor of the product
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param oper_t      Cusparse operator. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 */
 	virtual void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t) = 0;
 	/**
 	 * @brief       Interface of the virtual function of the product of M^-1*x. 
 	 *              Forwards the call to MxProduct() of the object behind instance
 	 * 
 	 * @param instance   User data sent to identify the function address. It is cast to CLCG_CUDAF_Solver*
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x[in]      Dense vector descriptor of the multiplier
 	 * @param prod_Mx[out]     Dense vector descriptor of the product
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param oper_t      Cusparse operator. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 */
 	static void _MxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		return reinterpret_cast<CLCG_CUDAF_Solver*>(instance)->MxProduct(cub_handle, cus_handle, x, prod_Mx, n_size, nz_size, oper_t);
 	}
 	/**
 	 * @brief       Virtual function of the product of M^-1*x. Pure virtual: must be implemented by a derived class
 	 * 
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x[in]      Dense vector descriptor of the multiplier
 	 * @param prod_Mx[out]     Dense vector descriptor of the product
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param oper_t      Cusparse operator. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 */
 	virtual void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t) = 0;
 	/**
 	 * @brief       Interface of the virtual function of the process monitoring. 
 	 *              Forwards the call to Progress() of the object behind instance
 	 * 
 	 * @param instance    User data sent to identify the function address. It is cast to CLCG_CUDAF_Solver*
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param k           Current iteration times
 	 * @return int        Status of the process
 	 */
 	static int _Progress(void* instance, const cuComplex* m, const float converge, 
 	    const clcg_para* param, const int n_size, const int nz_size, const int k)
 	{
 		return reinterpret_cast<CLCG_CUDAF_Solver*>(instance)->Progress(m, converge, param, n_size, nz_size, k);
 	}
 	/**
 	 * @brief       Virtual function of the process monitoring. It is not pure virtual, 
 	 *              so overriding it in a derived class is optional
 	 * 
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param k           Current iteration times
 	 * @return int        Status of the process
 	 */
 	virtual int Progress(const cuComplex* m, const float converge, 
 	    const clcg_para* param, const int n_size, const int nz_size, const int k);
 	/**
 	 * @brief      Do not report any processes
 	 */
 	void silent();
 	/**
 	 * @brief      Set the interval to run the process monitoring function
 	 * 
 	 * @param inter      the interval
 	 */
 	void set_report_interval(unsigned int inter);
 	/**
 	 * @brief      Set the parameters of the algorithms
 	 * 
 	 * @param in_param   the input parameters
 	 */
 	void set_clcg_parameter(const clcg_para &in_param);
 	/**
 	 * @brief      Run the minimizing process (no preconditioner)
 	 * 
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param n_size     Size of the solution vector
 	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param solver_id  Solver type. Defaults to CLCG_BICG
 	 * @param verbose    Report more information of the full process. Defaults to true
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception. Defaults to false
 	 */
 	void Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuComplex *x, cuComplex *b, 
 		const int n_size, const int nz_size, clcg_solver_enum solver_id = CLCG_BICG, bool verbose = true, bool er_throw = false);
 	/**
 	 * @brief      Run the preconditioned minimizing process
 	 * 
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param n_size     Size of the solution vector
 	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param solver_id  Solver type. Defaults to CLCG_PCG
 	 * @param verbose    Report more information of the full process. Defaults to true
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception. Defaults to false
 	 */
 	void MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuComplex *x, cuComplex *b, 
        const int n_size, const int nz_size, clcg_solver_enum solver_id = CLCG_PCG, bool verbose = true, bool er_throw = false);
 };
 /**
 * @brief      Complex linear conjugate gradient solver class working on cuDoubleComplex 
 *             solution vectors with an lcg_float convergence value.
 * 
 * The A*x and M^-1*x products are pure virtual, so a derived class must implement 
 * both of them. The static members _AxProduct, _MxProduct and _Progress take an 
 * opaque instance pointer, cast it back to CLCG_CUDA_Solver and forward the call 
 * to the matching virtual member.
 */
 class CLCG_CUDA_Solver
 {
 protected:
 	// Parameters of the algorithms
 	clcg_para param_;
 	// Interval of the process monitoring
 	unsigned int inter_;
 	// Silent flag
 	bool silent_;
 public:
 	CLCG_CUDA_Solver();
 	virtual ~CLCG_CUDA_Solver(){}
 	/**
 	 * @brief       Interface of the virtual function of the product of A*x. 
 	 *              Forwards the call to AxProduct() of the object behind instance
 	 * 
 	 * @param instance   User data sent to identify the function address. It is cast to CLCG_CUDA_Solver*
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x[in]      Dense vector descriptor of the multiplier
 	 * @param prod_Ax[out]     Dense vector descriptor of the product
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param oper_t      Cusparse operator. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 */
 	static void _AxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		return reinterpret_cast<CLCG_CUDA_Solver*>(instance)->AxProduct(cub_handle, cus_handle, x, prod_Ax, n_size, nz_size, oper_t);
 	}
 	/**
 	 * @brief       Virtual function of the product of A*x. Pure virtual: must be implemented by a derived class
 	 * 
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x[in]      Dense vector descriptor of the multiplier
 	 * @param prod_Ax[out]     Dense vector descriptor of the product
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param oper_t      Cusparse operator. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 */
 	virtual void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t) = 0;
 	/**
 	 * @brief       Interface of the virtual function of the product of M^-1*x. 
 	 *              Forwards the call to MxProduct() of the object behind instance
 	 * 
 	 * @param instance   User data sent to identify the function address. It is cast to CLCG_CUDA_Solver*
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x[in]      Dense vector descriptor of the multiplier
 	 * @param prod_Mx[out]     Dense vector descriptor of the product
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param oper_t      Cusparse operator. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 */
 	static void _MxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		return reinterpret_cast<CLCG_CUDA_Solver*>(instance)->MxProduct(cub_handle, cus_handle, x, prod_Mx, n_size, nz_size, oper_t);
 	}
 	/**
 	 * @brief       Virtual function of the product of M^-1*x. Pure virtual: must be implemented by a derived class
 	 * 
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x[in]      Dense vector descriptor of the multiplier
 	 * @param prod_Mx[out]     Dense vector descriptor of the product
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param oper_t      Cusparse operator. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 */
 	virtual void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t) = 0;
 	/**
 	 * @brief       Interface of the virtual function of the process monitoring. 
 	 *              Forwards the call to Progress() of the object behind instance
 	 * 
 	 * @param instance    User data sent to identify the function address. It is cast to CLCG_CUDA_Solver*
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param k           Current iteration times
 	 * @return int        Status of the process
 	 */
 	static int _Progress(void* instance, const cuDoubleComplex* m, const lcg_float converge, 
 	    const clcg_para* param, const int n_size, const int nz_size, const int k)
 	{
 		return reinterpret_cast<CLCG_CUDA_Solver*>(instance)->Progress(m, converge, param, n_size, nz_size, k);
 	}
 	/**
 	 * @brief       Virtual function of the process monitoring. It is not pure virtual, 
 	 *              so overriding it in a derived class is optional
 	 * 
 	 * @param m           Pointer of the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer of the parameters used in the algorithms
 	 * @param n_size      Size of the solution
 	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param k           Current iteration times
 	 * @return int        Status of the process
 	 */
 	virtual int Progress(const cuDoubleComplex* m, const lcg_float converge, 
 	    const clcg_para* param, const int n_size, const int nz_size, const int k);
 	/**
 	 * @brief      Do not report any processes
 	 */
 	void silent();
 	/**
 	 * @brief      Set the interval to run the process monitoring function
 	 * 
 	 * @param inter      the interval
 	 */
 	void set_report_interval(unsigned int inter);
 	/**
 	 * @brief      Set the parameters of the algorithms
 	 * 
 	 * @param in_param   the input parameters
 	 */
 	void set_clcg_parameter(const clcg_para &in_param);
 	/**
 	 * @brief      Run the minimizing process (no preconditioner)
 	 * 
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param n_size     Size of the solution vector
 	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param solver_id  Solver type. Defaults to CLCG_BICG
 	 * @param verbose    Report more information of the full process. Defaults to true
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception. Defaults to false
 	 */
 	void Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuDoubleComplex *x, cuDoubleComplex *b, 
 		const int n_size, const int nz_size, clcg_solver_enum solver_id = CLCG_BICG, bool verbose = true, bool er_throw = false);
 	/**
 	 * @brief      Run the preconditioned minimizing process
 	 * 
 	 * @param cub_handle  Handle of the cuBLAS library
 	 * @param cus_handle  Handle of the cuSPARSE library
 	 * @param x          Pointer of the solution vector
 	 * @param b          Pointer of the targeting vector
 	 * @param n_size     Size of the solution vector
 	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usage
 	 * @param solver_id  Solver type. Defaults to CLCG_PCG
 	 * @param verbose    Report more information of the full process. Defaults to true
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception. Defaults to false
 	 */
 	void MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuDoubleComplex *x, cuDoubleComplex *b, 
        const int n_size, const int nz_size, clcg_solver_enum solver_id = CLCG_PCG, bool verbose = true, bool er_throw = false);
 };
 #endif // LibLCG_CUDA
 #endif // _SOLVER_CUDA_H
--- a/src/lib/solver_eigen.cpp
+++ b/src/lib/solver_eigen.cpp
@@ -0,0 +1,365 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "solver_eigen.h"
 #include "cmath"
 #include "ctime"
 #include "iostream"
 #include "config.h"
 #ifdef LibLCG_OPENMP
 #include "omp.h"
 #endif
 /**
  * @brief      Build a solver that uses the library default parameters, 
  *             a report interval of 1 and has reporting enabled
  */
 LCG_EIGEN_Solver::LCG_EIGEN_Solver()
 	: param_(lcg_default_parameters()), inter_(1), silent_(false)
 {
 }
 /**
  * @brief      Default process monitor. Writes the iteration count and the 
  *             convergence value to std::clog on a single, carriage-returned line
  * 
  * @param m           Pointer of the current solution (not used here)
  * @param converge    Current value of the convergence
  * @param param       Pointer of the parameters used in the algorithms (only epsilon is read)
  * @param k           Current iteration times
  * @return int        Always 0
  */
 int LCG_EIGEN_Solver::Progress(const Eigen::VectorXd *m, const lcg_float converge, const lcg_para *param, 
 	const int k)
 {
 	// Report on every inter_-th iteration; off that schedule (or when inter_ is 0)
 	// report only once the convergence value has dropped to param->epsilon.
 	const bool report_now = (inter_ > 0 && k%inter_ == 0) || converge <= param->epsilon;
 	if (report_now)
 	{
 		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	}
 	return 0;
 }
 /**
  * @brief      Switch the solver to silent mode (raises the silent_ flag)
  */
 void LCG_EIGEN_Solver::silent()
 {
 	silent_ = true;
 }
 /**
  * @brief      Store the interval used by the process monitoring function
  * 
  * @param inter      the interval
  */
 void LCG_EIGEN_Solver::set_report_interval(unsigned int inter)
 {
 	inter_ = inter;
 }
 /**
  * @brief      Replace the stored algorithm parameters with a copy of in_param
  * 
  * @param in_param   the input parameters
  */
 void LCG_EIGEN_Solver::set_lcg_parameter(const lcg_para &in_param)
 {
 	param_ = in_param;
 }
 /**
  * @brief      Run the minimizing process and report the solver name and time cost
  * 
  * @param m          Solution vector
  * @param b          Targeting vector
  * @param solver_id  Solver type
  * @param verbose    Always report the returned status; otherwise only negative statuses are reported
  * @param er_throw   Passed on to lcg_error_str(); also suppresses the time cost summary
  */
 void LCG_EIGEN_Solver::Minimize(Eigen::VectorXd &m, const Eigen::VectorXd &b, 
 	lcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	// Silent mode: no progress callback, no timing, and a negative status is
 	// always handed to lcg_error_str() with the throw flag set.
 	if (silent_)
 	{
 		const int status = lcg_solver_eigen(_AxProduct, nullptr, m, b, &param_, this, solver_id);
 		if (status < 0) lcg_error_str(status, true);
 		return;
 	}

 	// Solve with lcg. Note that default arguments can not be omitted when the
 	// solver function is called through a function pointer.
 #ifdef LibLCG_OPENMP
 	const double t_begin = omp_get_wtime();
 	const int status = lcg_solver_eigen(_AxProduct, _Progress, m, b, &param_, this, solver_id);
 	const double t_end = omp_get_wtime();
 	const lcg_float elapsed_ms = 1000*(t_end-t_begin);
 #else
 	const clock_t t_begin = clock();
 	const int status = lcg_solver_eigen(_AxProduct, _Progress, m, b, &param_, this, solver_id);
 	const clock_t t_end = clock();
 	const lcg_float elapsed_ms = 1000*(t_end-t_begin)/(double)CLOCKS_PER_SEC;
 #endif

 	if (!er_throw)
 	{
 		// Pick the leading text of the summary line for the requested solver.
 		const char *banner;
 		switch (solver_id)
 		{
 			case LCG_CG:        banner = "Solver: CG. Time cost: "; break;
 			case LCG_CGS:       banner = "Solver: CGS. Time cost: "; break;
 			case LCG_BICGSTAB:  banner = "Solver: BICGSTAB. Times cost: "; break;
 			case LCG_BICGSTAB2: banner = "Solver: BICGSTAB2. Time cost: "; break;
 			default:            banner = "Solver: Unknown. Time cost: "; break;
 		}
 		// The leading newline ends the carriage-returned progress line.
 		std::clog << std::endl;
 		std::clog << banner << elapsed_ms << " ms" << std::endl;
 	}

 	if (verbose || status < 0) lcg_error_str(status, er_throw);
 }
 /**
  * @brief      Run the preconditioned minimizing process and report the solver name and time cost
  * 
  * Both timing variants (OpenMP wall clock and std clock()) call 
  * lcg_solver_preconditioned_eigen() with the _MxProduct preconditioner, so the 
  * result does not depend on whether LibLCG_OPENMP is defined.
  * 
  * @param m          Solution vector
  * @param b          Targeting vector
  * @param solver_id  Solver type
  * @param verbose    Always report the returned status; otherwise only negative statuses are reported
  * @param er_throw   Passed on to lcg_error_str(); also suppresses the time cost summary
  */
 void LCG_EIGEN_Solver::MinimizePreconditioned(Eigen::VectorXd &m, const Eigen::VectorXd &b, 
 	lcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	// Silent mode: no progress callback, no timing, and a negative status is
 	// always handed to lcg_error_str() with the throw flag set.
 	if (silent_)
 	{
 		int ret = lcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, nullptr, m, b, &param_, this, solver_id);
 		if (ret < 0) lcg_error_str(ret, true);
 		return;
 	}
 	// Solve with lcg. Note that default arguments can not be omitted when the
 	// solver function is called through a function pointer.
 #ifdef LibLCG_OPENMP
 	double start = omp_get_wtime();
 	int ret = lcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, _Progress, m, b, &param_, this, solver_id);
 	double end = omp_get_wtime();
 	lcg_float costime = 1000*(end-start);
 #else
 	clock_t start = clock();
 	// Fixed: this branch used to call the un-preconditioned lcg_solver_eigen(),
 	// which silently ignored the preconditioner in builds without OpenMP.
 	int ret = lcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, _Progress, m, b, &param_, this, solver_id);
 	clock_t end = clock();
 	lcg_float costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
 #endif
 	if (!er_throw)
 	{
 		// The leading newline ends the carriage-returned progress line.
 		std::clog << std::endl;
 		switch (solver_id)
 		{
 			case LCG_PCG:
 				std::clog << "Solver: PCG. Time cost: " << costime << " ms" << std::endl;
 				break;
 			default:
 				std::clog << "Solver: Unknown. Time cost: " << costime << " ms" << std::endl;
 				break;
 		}	
 	}
 	if (verbose) lcg_error_str(ret, er_throw);
 	else if (ret < 0) lcg_error_str(ret, er_throw);
 	return;
 }
 /**
  * @brief      Run the constrained minimizing process and report the solver name and time cost
  * 
  * @param m          Solution vector
  * @param B          Targeting vector
  * @param low        Lower bound of the solution vector
  * @param hig        Higher bound of the solution vector
  * @param solver_id  Solver type
  * @param verbose    Always report the returned status; otherwise only negative statuses are reported
  * @param er_throw   Passed on to lcg_error_str(); also suppresses the time cost summary
  */
 void LCG_EIGEN_Solver::MinimizeConstrained(Eigen::VectorXd &m, const Eigen::VectorXd &B, const Eigen::VectorXd &low, 
 	const Eigen::VectorXd &hig, lcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	// Silent mode: no progress callback, no timing, and a negative status is
 	// always handed to lcg_error_str() with the throw flag set.
 	if (silent_)
 	{
 		const int status = lcg_solver_constrained_eigen(_AxProduct, nullptr, m, B, low, hig, &param_, this, solver_id);
 		if (status < 0) lcg_error_str(status, true);
 		return;
 	}

 	// Solve with lcg. Note that default arguments can not be omitted when the
 	// solver function is called through a function pointer.
 #ifdef LibLCG_OPENMP
 	const double t_begin = omp_get_wtime();
 	const int status = lcg_solver_constrained_eigen(_AxProduct, _Progress, m, B, low, hig, &param_, this, solver_id);
 	const double t_end = omp_get_wtime();
 	const lcg_float elapsed_ms = 1000*(t_end-t_begin);
 #else
 	const clock_t t_begin = clock();
 	const int status = lcg_solver_constrained_eigen(_AxProduct, _Progress, m, B, low, hig, &param_, this, solver_id);
 	const clock_t t_end = clock();
 	const lcg_float elapsed_ms = 1000*(t_end-t_begin)/(double)CLOCKS_PER_SEC;
 #endif

 	if (!er_throw)
 	{
 		// Pick the leading text of the summary line for the requested solver.
 		const char *banner;
 		switch (solver_id)
 		{
 			case LCG_PG:  banner = "Solver: PG-CG. Time cost: "; break;
 			case LCG_SPG: banner = "Solver: SPG-CG. Time cost: "; break;
 			default:      banner = "Solver: Unknown. Time cost: "; break;
 		}
 		// The leading newline ends the carriage-returned progress line.
 		std::clog << std::endl;
 		std::clog << banner << elapsed_ms << " ms" << std::endl;
 	}

 	if (verbose || status < 0) lcg_error_str(status, er_throw);
 }
 /**
  * @brief      Build a solver that uses the library default parameters, 
  *             a report interval of 1 and has reporting enabled
  */
 CLCG_EIGEN_Solver::CLCG_EIGEN_Solver()
 	: param_(clcg_default_parameters()), inter_(1), silent_(false)
 {
 }
 /**
  * @brief      Default process monitor. Writes the iteration count and the 
  *             convergence value to std::clog on a single, carriage-returned line
  * 
  * @param m           Pointer of the current solution (not used here)
  * @param converge    Current value of the convergence
  * @param param       Pointer of the parameters used in the algorithms (only epsilon is read)
  * @param k           Current iteration times
  * @return int        Always 0
  */
 int CLCG_EIGEN_Solver::Progress(const Eigen::VectorXcd *m, const lcg_float converge, const clcg_para *param, 
 	const int k)
 {
 	// Report on every inter_-th iteration; off that schedule (or when inter_ is 0)
 	// report only once the convergence value has dropped to param->epsilon.
 	const bool report_now = (inter_ > 0 && (k%inter_) == 0) || converge <= param->epsilon;
 	if (report_now)
 	{
 		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	}
 	return 0;
 }
 /**
  * @brief      Switch the solver to silent mode (raises the silent_ flag)
  */
 void CLCG_EIGEN_Solver::silent()
 {
 	silent_ = true;
 }
 /**
  * @brief      Replace the stored algorithm parameters with a copy of in_param
  * 
  * @param in_param   the input parameters
  */
 void CLCG_EIGEN_Solver::set_clcg_parameter(const clcg_para &in_param)
 {
 	param_ = in_param;
 }
 /**
  * @brief      Store the interval used by the process monitoring function
  * 
  * @param inter      the interval
  */
 void CLCG_EIGEN_Solver::set_report_interval(unsigned int inter)
 {
 	inter_ = inter;
 }
 /**
  * @brief      Run the complex minimizing process and report the solver name and time cost
  * 
  * @param m          Solution vector
  * @param b          Targeting vector
  * @param solver_id  Solver type
  * @param verbose    Always report the returned status; otherwise only negative statuses are reported
  * @param er_throw   Passed on to clcg_error_str(); also suppresses the time cost summary
  */
 void CLCG_EIGEN_Solver::Minimize(Eigen::VectorXcd &m, const Eigen::VectorXcd &b, 
 	clcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	// Silent mode: no progress callback, no timing, and a negative status is
 	// always handed to clcg_error_str() with the throw flag set.
 	if (silent_)
 	{
 		const int status = clcg_solver_eigen(_AxProduct, nullptr, m, b, &param_, this, solver_id);
 		if (status < 0) clcg_error_str(status, true);
 		return;
 	}

 	// Solve with lcg. Note that default arguments can not be omitted when the
 	// solver function is called through a function pointer.
 #ifdef LibLCG_OPENMP
 	const double t_begin = omp_get_wtime();
 	const int status = clcg_solver_eigen(_AxProduct, _Progress, m, b, &param_, this, solver_id);
 	const double t_end = omp_get_wtime();
 	const lcg_float elapsed_ms = 1000*(t_end-t_begin);
 #else
 	const clock_t t_begin = clock();
 	const int status = clcg_solver_eigen(_AxProduct, _Progress, m, b, &param_, this, solver_id);
 	const clock_t t_end = clock();
 	const lcg_float elapsed_ms = 1000*(t_end-t_begin)/(double)CLOCKS_PER_SEC;
 #endif

 	if (!er_throw)
 	{
 		// Pick the leading text of the summary line for the requested solver.
 		const char *banner;
 		switch (solver_id)
 		{
 			case CLCG_BICG:     banner = "Solver: BI-CG. Time cost: "; break;
 			case CLCG_BICG_SYM: banner = "Solver: BI-CG (symmetrically accelerated). Time cost: "; break;
 			case CLCG_CGS:      banner = "Solver: CGS. Time cost: "; break;
 			case CLCG_TFQMR:    banner = "Solver: TFQMR. Times cost: "; break;
 			case CLCG_PCG:      banner = "Solver: PCG. Times cost: "; break;
 			case CLCG_PBICG:    banner = "Solver: PBICG. Times cost: "; break;
 			default:            banner = "Solver: Unknown. Time cost: "; break;
 		}
 		// The leading newline ends the carriage-returned progress line.
 		std::clog << std::endl;
 		std::clog << banner << elapsed_ms << " ms" << std::endl;
 	}

 	if (verbose || status < 0) clcg_error_str(status, er_throw);
 }
 /**
  * @brief      Run the complex preconditioned minimizing process and report the solver name and time cost
  * 
  * @param m          Solution vector
  * @param b          Targeting vector
  * @param solver_id  Solver type
  * @param verbose    Always report the returned status; otherwise only negative statuses are reported
  * @param er_throw   Passed on to clcg_error_str(); also suppresses the time cost summary
  */
 void CLCG_EIGEN_Solver::MinimizePreconditioned(Eigen::VectorXcd &m, const Eigen::VectorXcd &b, 
 	clcg_solver_enum solver_id, bool verbose, bool er_throw)
 {
 	// Silent mode: no progress callback, no timing, and a negative status is
 	// always handed to clcg_error_str() with the throw flag set.
 	if (silent_)
 	{
 		const int status = clcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, nullptr, m, b, &param_, this, solver_id);
 		if (status < 0) clcg_error_str(status, true);
 		return;
 	}

 	// Solve with lcg. Note that default arguments can not be omitted when the
 	// solver function is called through a function pointer.
 #ifdef LibLCG_OPENMP
 	const double t_begin = omp_get_wtime();
 	const int status = clcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, _Progress, m, b, &param_, this, solver_id);
 	const double t_end = omp_get_wtime();
 	const lcg_float elapsed_ms = 1000*(t_end-t_begin);
 #else
 	const clock_t t_begin = clock();
 	const int status = clcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, _Progress, m, b, &param_, this, solver_id);
 	const clock_t t_end = clock();
 	const lcg_float elapsed_ms = 1000*(t_end-t_begin)/(double)CLOCKS_PER_SEC;
 #endif

 	if (!er_throw)
 	{
 		// Pick the leading text of the summary line for the requested solver.
 		const char *banner;
 		switch (solver_id)
 		{
 			case CLCG_PCG:   banner = "Solver: PCG. Times cost: "; break;
 			case CLCG_PBICG: banner = "Solver: PBICG. Times cost: "; break;
 			default:         banner = "Solver: Unknown. Time cost: "; break;
 		}
 		// The leading newline ends the carriage-returned progress line.
 		std::clog << std::endl;
 		std::clog << banner << elapsed_ms << " ms" << std::endl;
 	}

 	if (verbose || status < 0) clcg_error_str(status, er_throw);
 }
--- a/src/lib/solver_eigen.h
+++ b/src/lib/solver_eigen.h
@@ -0,0 +1,308 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _SOLVER_EIGEN_H
 #define _SOLVER_EIGEN_H
 #include "lcg_eigen.h"
 #include "clcg_eigen.h"
 /**
 * @brief      Linear conjugate gradient solver class
 */
 class LCG_EIGEN_Solver
 {
 protected:
 	lcg_para param_; ///< Algorithm parameters (see set_lcg_parameter())
 	unsigned int inter_; ///< Report interval (see set_report_interval()); how it is consumed is not visible in this header
 	bool silent_; ///< Silent flag (see silent()); how it is consumed is not visible in this header
 public:
 	LCG_EIGEN_Solver();
 	virtual ~LCG_EIGEN_Solver(){}
 	/**
 	 * @brief       Static callback adapter for the product A*x
 	 * 
 	 * Casts instance to LCG_EIGEN_Solver* and forwards the call to the virtual AxProduct().
 	 * 
 	 * @param instance   User data; must point to the LCG_EIGEN_Solver object to dispatch to
 	 * @param x[in]      Reference to the multiplier vector
 	 * @param prod_Ax[out]     Reference to the vector receiving the product
 	 */
 	static void _AxProduct(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Ax)
 	{
 		return reinterpret_cast<LCG_EIGEN_Solver*>(instance)->AxProduct(x, prod_Ax);
 	}
 	/**
 	 * @brief       Virtual function of the product of A*x (must be implemented by the derived class)
 	 * 
 	 * @param x[in]     Reference to the multiplier vector
 	 * @param prod_Ax[out]    Reference to the vector receiving the product
 	 */
 	virtual void AxProduct(const Eigen::VectorXd &x, Eigen::VectorXd &prod_Ax) = 0;
 	/**
 	 * @brief       Static callback adapter for the product M^-1*x
 	 * 
 	 * Casts instance to LCG_EIGEN_Solver* and forwards the call to the virtual MxProduct().
 	 * 
 	 * @param instance   User data; must point to the LCG_EIGEN_Solver object to dispatch to
 	 * @param x[in]      Reference to the multiplier vector
 	 * @param prod_Mx[out]     Reference to the vector receiving the product
 	 */
 	static void _MxProduct(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Mx)
 	{
 		return reinterpret_cast<LCG_EIGEN_Solver*>(instance)->MxProduct(x, prod_Mx);
 	}
 	/**
 	 * @brief       Virtual function of the product of M^-1*x (must be implemented by the derived class)
 	 * 
 	 * @param x[in]     Reference to the multiplier vector
 	 * @param prod_Mx[out]    Reference to the vector receiving the product
 	 */
 	virtual void MxProduct(const Eigen::VectorXd &x, Eigen::VectorXd &prod_Mx) = 0;
 	/**
 	 * @brief       Static callback adapter for the process monitoring
 	 * 
 	 * Casts instance to LCG_EIGEN_Solver* and forwards the call to the virtual Progress().
 	 * 
 	 * @param instance    User data; must point to the LCG_EIGEN_Solver object to dispatch to
 	 * @param m           Pointer to the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer to the parameters used in the algorithms
 	 * @param k           Current iteration count
 	 * @return int        Status of the process (the value returned by Progress())
 	 */
 	static int _Progress(void* instance, const Eigen::VectorXd *m, const lcg_float converge, 
 		const lcg_para *param, const int k)
 	{
 		return reinterpret_cast<LCG_EIGEN_Solver*>(instance)->Progress(m, converge, param, k);
 	}
 	/**
 	 * @brief       Virtual function of the process monitoring
 	 * 
 	 * Not pure virtual: a default implementation is defined outside this header, and
 	 * derived classes may override it.
 	 * 
 	 * @param m           Pointer to the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer to the parameters used in the algorithms
 	 * @param k           Current iteration count
 	 * @return int        Status of the process
 	 */
 	virtual int Progress(const Eigen::VectorXd *m, const lcg_float converge, const lcg_para *param, 
 		const int k);
 	/**
 	 * @brief      Do not report any processes
 	 */
 	void silent();
 	/**
 	 * @brief      Set the interval to run the process monitoring function
 	 * 
 	 * @param inter      the interval
 	 */
 	void set_report_interval(unsigned int inter);
 	/**
 	 * @brief      Set the parameters of the algorithms
 	 * 
 	 * @param in_param   the input parameters
 	 */
 	void set_lcg_parameter(const lcg_para &in_param);
 	/**
 	 * @brief      Run the minimizing process
 	 * 
 	 * @param m          Reference to the solution vector
 	 * @param b          Reference to the targeting vector
 	 * @param solver_id  Solver type (defaults to LCG_CG)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void Minimize(Eigen::VectorXd &m, const Eigen::VectorXd &b, lcg_solver_enum solver_id = LCG_CG, 
 		bool verbose = true, bool er_throw = false);
 	/**
 	 * @brief      Run the preconditioned minimizing process
 	 * 
 	 * @param m          Reference to the solution vector
 	 * @param b          Reference to the targeting vector
 	 * @param solver_id  Solver type (defaults to LCG_PCG)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void MinimizePreconditioned(Eigen::VectorXd &m, const Eigen::VectorXd &b, lcg_solver_enum solver_id = LCG_PCG, 
 		bool verbose = true, bool er_throw = false);
 	/**
 	 * @brief      Run the constrained minimizing process
 	 * 
 	 * @param m          Reference to the solution vector
 	 * @param B          Reference to the targeting vector
 	 * @param low        Lower bound of the solution vector
 	 * @param hig        Higher bound of the solution vector
 	 * @param solver_id  Solver type (defaults to LCG_PG)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void MinimizeConstrained(Eigen::VectorXd &m, const Eigen::VectorXd &B, const Eigen::VectorXd &low, 
 		const Eigen::VectorXd &hig, lcg_solver_enum solver_id = LCG_PG, bool verbose = true, 
 		bool er_throw = false);
 };
 /**
 * @brief      Complex linear conjugate gradient solver class
 */
 class CLCG_EIGEN_Solver
 {
 protected:
 	clcg_para param_; ///< Algorithm parameters (see set_clcg_parameter())
 	unsigned int inter_; ///< Report interval (see set_report_interval()); how it is consumed is not visible in this header
 	bool silent_; ///< Silent flag (see silent())
 public:
 	CLCG_EIGEN_Solver();
 	virtual ~CLCG_EIGEN_Solver(){}
 	/**
 	 * @brief       Static callback adapter for the product A*x
 	 * 
 	 * Casts instance to CLCG_EIGEN_Solver* and forwards the call to the virtual AxProduct().
 	 * 
 	 * @param instance   User data; must point to the CLCG_EIGEN_Solver object to dispatch to
 	 * @param x[in]      Reference to the multiplier vector
 	 * @param prod_Ax[out]     Reference to the vector receiving the product
 	 * @param layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
 	 * @param conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
 	 */
 	static void _AxProduct(void* instance, const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Ax, 
 		lcg_matrix_e layout, clcg_complex_e conjugate)
 	{
 		return reinterpret_cast<CLCG_EIGEN_Solver*>(instance)->AxProduct(x, prod_Ax, layout, conjugate);
 	}
 	/**
 	 * @brief       Virtual function of the product of A*x (must be implemented by the derived class)
 	 * 
 	 * @param x[in]      Reference to the multiplier vector
 	 * @param prod_Ax[out]     Reference to the vector receiving the product
 	 * @param layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
 	 * @param conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
 	 */
 	virtual void AxProduct(const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Ax, 
 		lcg_matrix_e layout, clcg_complex_e conjugate) = 0;
 	/**
 	 * @brief       Static callback adapter for the product M^-1*x
 	 * 
 	 * Casts instance to CLCG_EIGEN_Solver* and forwards the call to the virtual MxProduct().
 	 * 
 	 * @param instance   User data; must point to the CLCG_EIGEN_Solver object to dispatch to
 	 * @param x[in]      Reference to the multiplier vector
 	 * @param prod_Mx[out]     Reference to the vector receiving the product
 	 * @param layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
 	 * @param conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
 	 */
    static void _MxProduct(void* instance, const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Mx, 
        lcg_matrix_e layout, clcg_complex_e conjugate)
    {
        return reinterpret_cast<CLCG_EIGEN_Solver*>(instance)->MxProduct(x, prod_Mx, layout, conjugate);
    }
 	/**
 	 * @brief       Virtual function of the product of M^-1*x (must be implemented by the derived class)
 	 * 
 	 * @param x[in]      Reference to the multiplier vector
 	 * @param prod_Mx[out]     Reference to the vector receiving the product
 	 * @param layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
 	 * @param conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
 	 */
    virtual void MxProduct(const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Mx, 
        lcg_matrix_e layout, clcg_complex_e conjugate) = 0;
 	/**
 	 * @brief       Static callback adapter for the process monitoring
 	 * 
 	 * Casts instance to CLCG_EIGEN_Solver* and forwards the call to the virtual Progress().
 	 * 
 	 * @param instance    User data; must point to the CLCG_EIGEN_Solver object to dispatch to
 	 * @param m           Pointer to the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer to the parameters used in the algorithms
 	 * @param k           Current iteration count
 	 * @return int        Status of the process (the value returned by Progress())
 	 */
 	static int _Progress(void* instance, const Eigen::VectorXcd *m, const lcg_float converge, 
 		const clcg_para *param, const int k)
 	{
 		return reinterpret_cast<CLCG_EIGEN_Solver*>(instance)->Progress(m, converge, param, k);
 	}
 	/**
 	 * @brief       Virtual function of the process monitoring
 	 * 
 	 * Not pure virtual: a default implementation is defined outside this header, and
 	 * derived classes may override it.
 	 * 
 	 * @param m           Pointer to the current solution
 	 * @param converge    Current value of the convergence
 	 * @param param       Pointer to the parameters used in the algorithms
 	 * @param k           Current iteration count
 	 * @return int        Status of the process
 	 */
 	virtual int Progress(const Eigen::VectorXcd *m, const lcg_float converge, const clcg_para *param, 
 		const int k);
 	/**
 	 * @brief      Do not report any processes
 	 */
 	void silent();
 	/**
 	 * @brief      Set the interval to run the process monitoring function
 	 * 
 	 * @param inter      the interval
 	 */
 	void set_report_interval(unsigned int inter);
 	/**
 	 * @brief      Set the parameters of the algorithms
 	 * 
 	 * @param in_param   the input parameters
 	 */
 	void set_clcg_parameter(const clcg_para &in_param);
 	/**
 	 * @brief      Run the minimizing process
 	 * 
 	 * @param m          Reference to the solution vector
 	 * @param b          Reference to the targeting vector
 	 * @param solver_id  Solver type (defaults to CLCG_CGS)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
 	void Minimize(Eigen::VectorXcd &m, const Eigen::VectorXcd &b, clcg_solver_enum solver_id = CLCG_CGS, 
 		bool verbose = true, bool er_throw = false);
 	/**
 	 * @brief      Run the preconditioned minimizing process
 	 * 
 	 * @param m          Reference to the solution vector
 	 * @param b          Reference to the targeting vector
 	 * @param solver_id  Solver type (defaults to CLCG_PBICG)
 	 * @param verbose    Report more information of the full process
 	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
 	 */
    void MinimizePreconditioned(Eigen::VectorXcd &m, const Eigen::VectorXcd &b, clcg_solver_enum solver_id = CLCG_PBICG, 
        bool verbose = true, bool er_throw = false);
 };
 #endif // _SOLVER_EIGEN_H
--- a/src/lib/util.cpp
+++ b/src/lib/util.cpp
@@ -0,0 +1,253 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "iostream"
 #include "exception"
 #include "stdexcept"
 #include "util.h"
 #if defined _WINDOWS || __WIN32__
 #include "windows.h"
 #endif
 // Return a copy of the library-wide default lcg_para (defparam).
 lcg_para lcg_default_parameters()
 {
 	return defparam;
 }
 // Map a solver name (e.g. "LCG_CG") to its lcg_solver_enum value.
 // Throws std::invalid_argument when the name is not one of the known solvers.
 lcg_solver_enum lcg_select_solver(std::string slr_char)
 {
 	// Accepted names paired with the solver each one selects.
 	static const struct { const char *name; lcg_solver_enum id; } known_solvers[] = {
 		{"LCG_CG", LCG_CG},
 		{"LCG_PCG", LCG_PCG},
 		{"LCG_CGS", LCG_CGS},
 		{"LCG_BICGSTAB", LCG_BICGSTAB},
 		{"LCG_BICGSTAB2", LCG_BICGSTAB2},
 		{"LCG_PG", LCG_PG},
 		{"LCG_SPG", LCG_SPG},
 	};
 	const int solver_count = sizeof(known_solvers)/sizeof(known_solvers[0]);
 	for (int i = 0; i < solver_count; i++)
 	{
 		if (slr_char == known_solvers[i].name) return known_solvers[i].id;
 	}
 	throw std::invalid_argument("Invalid solver type.");
 }
 /**
  * @brief      Display or throw the explanation of a lcg solver return code.
  *
  * When er_throw is true and er_index is negative, nothing is printed and a
  * std::runtime_error carrying the explanation is thrown. When er_throw is true
  * and er_index is non-negative, the plain explanation is written to std::cerr.
  * When er_throw is false, the explanation is written to std::cerr with a colored
  * "Success! " (er_index >= 0) or "Fail! " (er_index < 0) prefix.
  * Every printed message is terminated with a newline.
  *
  * @param[in]  er_index  The return code to explain.
  * @param[in]  er_throw  Throw on negative codes instead of printing them.
  */
 void lcg_error_str(int er_index, bool er_throw)
 {
 	std::string err_str;
 	switch (er_index)
 	{
 		case LCG_SUCCESS:
 			err_str = "Iteration reached convergence."; break;
 		case LCG_STOP:
 			err_str = "Iteration is stopped by the progress evaluation function."; break;
 		case LCG_ALREADY_OPTIMIZIED:
 			err_str = "The variables are already optimized."; break;
 		case LCG_UNKNOWN_ERROR:
 			err_str = "Unknown error."; break;
 		case LCG_INVILAD_VARIABLE_SIZE:
 			err_str = "The size of the variables is negative."; break;
 		case LCG_INVILAD_MAX_ITERATIONS:
 			err_str = "The maximal iteration times can't be negative."; break;
 		case LCG_INVILAD_EPSILON:
 			err_str = "The epsilon is not in the range (0, 1)."; break;
 		case LCG_INVILAD_RESTART_EPSILON:
 			err_str = "The restart threshold can't be negative."; break;
 		case LCG_REACHED_MAX_ITERATIONS:
 			err_str = "The maximal iteration has been reached."; break;
 		case LCG_NULL_PRECONDITION_MATRIX:
 			err_str = "The precondition matrix can't be null."; break;
 		case LCG_NAN_VALUE:
 			err_str = "The model values are NaN."; break;
 		case LCG_INVALID_POINTER:
 			err_str = "Invalid pointer."; break;
 		case LCG_INVALID_LAMBDA:
 			err_str = "Invalid value for lambda."; break;
 		case LCG_INVALID_SIGMA:
 			err_str = "Invalid value for sigma."; break;
 		case LCG_INVALID_BETA:
 			err_str = "Invalid value for beta."; break;
 		case LCG_INVALID_MAXIM:
 			err_str = "Invalid value for maxi_m."; break;
 		case LCG_SIZE_NOT_MATCH:
 			err_str = "The sizes of solution and target do not match."; break;
 		default:
 			err_str = "Unknown error."; break;
 	}
 	// Throw mode: a failure is reported only through the exception.
 	if (er_throw && er_index < 0) throw std::runtime_error(err_str);
 	if (er_throw)
 	{
 		// Throw mode with a non-negative code: plain message. The line is now
 		// terminated so that later output does not run on from it.
 		std::cerr << err_str << std::endl;
 		return;
 	}
 #if defined _WINDOWS || __WIN32__
 	HANDLE err_handle = GetStdHandle(STD_ERROR_HANDLE);
 	if (er_index >= 0)
 	{
 		SetConsoleTextAttribute(err_handle, FOREGROUND_INTENSITY | FOREGROUND_GREEN);
 		std::cerr << "Success! ";
 	}
 	else
 	{
 		SetConsoleTextAttribute(err_handle, FOREGROUND_INTENSITY | FOREGROUND_RED);
 		std::cerr << "Fail! ";
 	}
 	std::cerr << err_str;
 	// 7 is the console attribute value the original code resets to after the message.
 	SetConsoleTextAttribute(err_handle, 7);
 	std::cerr << std::endl;
 #else
 	// ANSI escapes: bold green for success, bold red for failure, reset at the end.
 	if (er_index >= 0)
 		std::cerr << "\033[1m\033[32mSuccess! ";
 	else
 		std::cerr << "\033[1m\033[31mFail! ";
 	std::cerr << err_str << "\033[0m" << std::endl;
 #endif
 	return;
 }
 // Return a copy of the library-wide default clcg_para (defparam2).
 clcg_para clcg_default_parameters()
 {
 	return defparam2;
 }
 /**
  * @brief      Map a solver name to its clcg_solver_enum value.
  *
  * Every enumerator of clcg_solver_enum is accepted by its own identifier,
  * including CLCG_BICGSTAB, CLCG_PCG and CLCG_PBICG, which previously could
  * not be selected by name.
  *
  * @param[in]  slr_char  Name of the solver, e.g. "CLCG_CGS"
  *
  * @return     The matching clcg solver enum.
  * @throws     std::invalid_argument if the name matches no solver.
  */
 clcg_solver_enum clcg_select_solver(std::string slr_char)
 {
 	if (slr_char == "CLCG_BICG") return CLCG_BICG;
 	if (slr_char == "CLCG_BICG_SYM") return CLCG_BICG_SYM;
 	if (slr_char == "CLCG_CGS") return CLCG_CGS;
 	if (slr_char == "CLCG_BICGSTAB") return CLCG_BICGSTAB;
 	if (slr_char == "CLCG_TFQMR") return CLCG_TFQMR;
 	if (slr_char == "CLCG_PCG") return CLCG_PCG;
 	if (slr_char == "CLCG_PBICG") return CLCG_PBICG;
 	throw std::invalid_argument("Invalid solver type.");
 }
 /**
  * @brief      Display or throw the explanation of a clcg solver return code.
  *
  * When er_throw is true and er_index is negative, nothing is printed and a
  * std::runtime_error carrying the explanation is thrown. When er_throw is true
  * and er_index is non-negative, the plain explanation is written to std::cerr.
  * When er_throw is false, the explanation is written to std::cerr with a colored
  * "Success! " (er_index >= 0) or "Fail! " (er_index < 0) prefix.
  * Every printed message is terminated with a newline.
  *
  * @param[in]  er_index  The return code to explain.
  * @param[in]  er_throw  Throw on negative codes instead of printing them.
  */
 void clcg_error_str(int er_index, bool er_throw)
 {
 	std::string err_str;
 	switch (er_index)
 	{
 		case CLCG_SUCCESS:
 			err_str = "Iteration reached convergence."; break;
 		case CLCG_STOP:
 			err_str = "Iteration is stopped by the progress evaluation function."; break;
 		case CLCG_ALREADY_OPTIMIZIED:
 			err_str = "The variables are already optimized."; break;
 		case CLCG_UNKNOWN_ERROR:
 			err_str = "Unknown error."; break;
 		case CLCG_INVILAD_VARIABLE_SIZE:
 			err_str = "The size of the variables is negative."; break;
 		case CLCG_INVILAD_MAX_ITERATIONS:
 			err_str = "The maximal iteration times is negative."; break;
 		case CLCG_INVILAD_EPSILON:
 			err_str = "The epsilon is not in the range (0, 1)."; break;
 		case CLCG_REACHED_MAX_ITERATIONS:
 			err_str = "The maximal iteration has been reached."; break;
 		case CLCG_NAN_VALUE:
 			err_str = "The model values are NaN."; break;
 		case CLCG_INVALID_POINTER:
 			err_str = "Invalid pointer."; break;
 		case CLCG_SIZE_NOT_MATCH:
 			err_str = "The sizes of the solution and target do not match."; break;
 		case CLCG_UNKNOWN_SOLVER:
 			err_str = "Unknown solver."; break;
 		default:
 			err_str = "Unknown error."; break;
 	}
 	// Throw mode: a failure is reported only through the exception.
 	if (er_throw && er_index < 0) throw std::runtime_error(err_str);
 	if (er_throw)
 	{
 		// Throw mode with a non-negative code: plain message. The line is now
 		// terminated so that later output does not run on from it.
 		std::cerr << err_str << std::endl;
 		return;
 	}
 #if defined _WINDOWS || __WIN32__
 	HANDLE err_handle = GetStdHandle(STD_ERROR_HANDLE);
 	if (er_index >= 0)
 	{
 		SetConsoleTextAttribute(err_handle, FOREGROUND_INTENSITY | FOREGROUND_GREEN);
 		std::cerr << "Success! ";
 	}
 	else
 	{
 		SetConsoleTextAttribute(err_handle, FOREGROUND_INTENSITY | FOREGROUND_RED);
 		std::cerr << "Fail! ";
 	}
 	std::cerr << err_str;
 	// 7 is the console attribute value the original code resets to after the message.
 	SetConsoleTextAttribute(err_handle, 7);
 	std::cerr << std::endl;
 #else
 	// ANSI escapes: bold green for success, bold red for failure, reset at the end.
 	if (er_index >= 0)
 		std::cerr << "\033[1m\033[32mSuccess! ";
 	else
 		std::cerr << "\033[1m\033[31mFail! ";
 	std::cerr << err_str << "\033[0m" << std::endl;
 #endif
 	return;
 }
--- a/src/lib/util.h
+++ b/src/lib/util.h
@@ -0,0 +1,308 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #ifndef _LCG_UTIL_H
 #define _LCG_UTIL_H
 #include "string"
 #include "algebra.h"
 /**
 * @brief      Types of method that could be recognized by the lcg_solver() function.
 */
 enum lcg_solver_enum
 {
 	/**
 	 * Conjugate gradient method.
 	 */
 	LCG_CG,
 	/**
 	 * Preconditioned conjugate gradient method.
 	 */
 	LCG_PCG,
 	/**
 	 * Conjugate gradient squared method.
 	 */
 	LCG_CGS,
 	/**
 	 * Biconjugate gradient stabilized method.
 	 */
 	LCG_BICGSTAB,
 	/**
 	 * Biconjugate gradient stabilized method with restart
 	 * (see lcg_para::restart_epsilon).
 	 */
 	LCG_BICGSTAB2,
 	/**
 	 * Conjugate gradient method with projected gradient for inequality constraints.
 	 * This algorithm comes without non-monotonic linear search for the step length.
 	 */
 	LCG_PG,
 	/**
 	 * Conjugate gradient method with spectral projected gradient for inequality constraints.
 	 * This algorithm comes with non-monotonic linear search for the step length.
 	 */
 	LCG_SPG,
 };
 /**
 * @brief      return value of the lcg_solver() function
 */
 enum lcg_return_enum
 {
 	// Non-negative values are successful outcomes. LCG_SUCCESS and LCG_CONVERGENCE share the value 0.
 	LCG_SUCCESS = 0, ///< The solver function terminated successfully.
 	LCG_CONVERGENCE = 0, ///< The iteration reached convergence.
 	LCG_STOP, ///< The iteration is stopped by the monitoring function.
 	LCG_ALREADY_OPTIMIZIED, ///< The initial solution is already optimized.
 	// A negative number means an error. The codes count upwards from -1024.
 	LCG_UNKNOWN_ERROR = -1024, ///< Unknown error.
 	LCG_INVILAD_VARIABLE_SIZE, ///< The variable size is negative.
 	LCG_INVILAD_MAX_ITERATIONS, ///< The maximal iteration times is negative.
 	LCG_INVILAD_EPSILON, ///< Invalid epsilon (reported by lcg_error_str() as not in the range (0, 1)).
 	LCG_INVILAD_RESTART_EPSILON, ///< The restart epsilon is negative.
 	LCG_REACHED_MAX_ITERATIONS, ///< Iteration reached maximal limit.
 	LCG_NULL_PRECONDITION_MATRIX, ///< Null precondition matrix.
 	LCG_NAN_VALUE, ///< NaN value.
 	LCG_INVALID_POINTER, ///< Invalid pointer.
 	LCG_INVALID_LAMBDA, ///< Invalid range for lambda.
 	LCG_INVALID_SIGMA, ///< Invalid range for sigma.
 	LCG_INVALID_BETA, ///< Invalid range for beta.
 	LCG_INVALID_MAXIM, ///< Invalid range for maxi_m.
 	LCG_SIZE_NOT_MATCH, ///< Sizes of m and B do not match.
 };
 /**
 * @brief      Parameters of the conjugate gradient methods.
 */
 struct lcg_para
 {
 	/**
 	 * Maximal iteration times. The process will continue till the convergence is met
 	 * if this option is set to zero (default).
 	*/
 	int max_iterations;
 	/**
 	 * Epsilon for convergence test.
 	 * This parameter determines the accuracy with which the solution is to be 
 	 * found. A minimization terminates when ||g||/max(||g0||, 1.0) <= epsilon or 
 	 * sqrt(||g||)/N <= epsilon for the lcg_solver() function, where ||.|| denotes 
 	 * the Euclidean (L2) norm. The default value of epsilon is 1e-8.
 	*/
 	lcg_float epsilon;
 	/**
 	 * Whether to use absolute mean differences (AMD) between |Ax - B| to evaluate the process. 
 	 * The default value is 0 (false), which means the gradient based evaluating method is used. 
 	 * The AMD based method will be used if this variable is set to a non-zero value (true). 
 	 * This parameter is only applied to the non-constrained methods.
 	 */
 	int abs_diff;
 	/**
 	 * Restart epsilon for the LCG_BICGSTAB2 algorithm. The default value is 1e-6.
 	 */
 	lcg_float restart_epsilon;
 	/**
 	 * Initial step length for the projected gradient method. The default is 1.0.
 	 */
 	lcg_float step;
 	/**
 	 * Multiplier for updating solutions with the spectral projected gradient method. The range of
 	 * this variable is (0, 1). The default is given as 0.95.
 	 */
 	lcg_float sigma;
 	/**
 	 * Descending ratio for conducting the non-monotonic linear search. The range of
 	 * this variable is (0, 1). The default is given as 0.9.
 	 */
 	lcg_float beta;
 	/**
 	 * The maximal record times of the objective values for the SPG method. The method uses the 
 	 * objective values from the most recent maxi_m times to perform the non-monotonic linear search.
 	 * The default value is 10.
 	 */
 	int maxi_m;
 };
 /**
 * Default parameters for the conjugate gradient methods, in lcg_para member order:
 * max_iterations = 0, epsilon = 1e-8, abs_diff = 0, restart_epsilon = 1e-6,
 * step = 1.0, sigma = 0.95, beta = 0.9, maxi_m = 10.
 */
 static const lcg_para defparam = {0, 1e-8, 0, 1e-6, 1.0, 0.95, 0.9, 10};
 /**
 * @brief      Return a lcg_para type instance with default values.
 * 
 * Users can use this function to get the default parameter values (a copy of defparam)
 * for the conjugate gradient methods.
 * 
 * @return     A lcg_para type instance.
 */
 lcg_para lcg_default_parameters();
 /**
 * @brief      Select a type of solver according to the name
 *
 * @param[in]  slr_char  Name of the solver, spelled like the enumerator (e.g. "LCG_CG")
 *
 * @return     The lcg solver enum.
 * @throws     std::invalid_argument if the name is not recognized.
 */
 lcg_solver_enum lcg_select_solver(std::string slr_char);
 /**
 * @brief      Display or throw out a string explanation for the lcg_solver() function's return values.
 *
 * Nothing is returned. The explanation is written to std::cerr, unless er_throw is true
 * and er_index is negative, in which case it is thrown as a std::runtime_error.
 *
 * @param[in]  er_index  The error index returned by the lcg_solver() function.
 * @param[in]  er_throw  Throw the explanation of a negative er_index instead of printing it.
 */
 void lcg_error_str(int er_index, bool er_throw = false);
 /**
 * @brief      Types of method that could be recognized by the clcg_solver() function.
 */
 enum clcg_solver_enum
 {
 	/**
 	 * Jacob's Bi-Conjugate Gradient Method
 	 */
 	CLCG_BICG,
 	/**
 	 * Bi-Conjugate Gradient Method accelerated for complex symmetric A
 	 */
 	CLCG_BICG_SYM,
 	/**
 	 * Conjugate Gradient Squared Method with real coefficients.
 	 */
 	CLCG_CGS,
 	/**
 	 * Biconjugate gradient stabilized method.
 	 */
 	CLCG_BICGSTAB,
 	/**
 	 * Quasi-Minimal Residual Method (enumerator currently commented out)
 	 */
 	//CLCG_QMR,
 	/**
 	 * Transpose Free Quasi-Minimal Residual Method
 	 */
 	CLCG_TFQMR,
 	/**
 	 * Preconditioned conjugate gradient
 	 */
 	CLCG_PCG,
 	/**
 	 * Preconditioned Bi-Conjugate Gradient Method
 	 */
 	CLCG_PBICG,
 };
 /**
 * @brief      return value of the clcg_solver() function
 */
 enum clcg_return_enum
 {
 	// Non-negative values are successful outcomes. CLCG_SUCCESS and CLCG_CONVERGENCE share the value 0.
 	CLCG_SUCCESS = 0, ///< The solver function terminated successfully.
 	CLCG_CONVERGENCE = 0, ///< The iteration reached convergence.
 	CLCG_STOP, ///< The iteration is stopped by the monitoring function.
 	CLCG_ALREADY_OPTIMIZIED, ///< The initial solution is already optimized.
 	// A negative number means an error. The codes count upwards from -1024.
 	CLCG_UNKNOWN_ERROR = -1024, ///< Unknown error.
 	CLCG_INVILAD_VARIABLE_SIZE, ///< The variable size is negative.
 	CLCG_INVILAD_MAX_ITERATIONS, ///< The maximal iteration times is negative.
 	CLCG_INVILAD_EPSILON, ///< Invalid epsilon (reported by clcg_error_str() as not in the range (0, 1)).
 	CLCG_REACHED_MAX_ITERATIONS, ///< Iteration reached maximal limit.
 	CLCG_NAN_VALUE, ///< NaN value.
 	CLCG_INVALID_POINTER, ///< Invalid pointer.
 	CLCG_SIZE_NOT_MATCH, ///< Sizes of m and B do not match.
 	CLCG_UNKNOWN_SOLVER, ///< Unknown solver.
 };
 /**
 * @brief      Parameters of the conjugate gradient methods.
 */
 struct clcg_para
 {
 	/**
 	 * Maximal iteration times. The process will continue till the convergence is met
 	 * if this option is set to zero (default).
 	*/
 	int max_iterations;
 	/**
 	 * Epsilon for convergence test.
 	 * This parameter determines the accuracy with which the solution is to be found. 
 	 * A minimization terminates when ||g||/max(||g0||, 1.0) <= epsilon or sqrt(||g||)/N 
 	 * <= epsilon for the lcg_solver() function, where ||.|| denotes the Euclidean (L2) norm. 
 	 * The default value of epsilon is 1e-8. For box-constrained methods, the convergence test 
 	 * is implemented using ||P(m-g) - m|| <= epsilon, in which P is the projector that 
 	 * transfers m into the constrained domain.
 	 * NOTE(review): this text refers to lcg_solver() and box-constrained methods; confirm
 	 * that it also describes the complex (clcg) solvers.
 	*/
 	lcg_float epsilon;
 	/**
 	 * Whether to use absolute mean differences (AMD) between |Ax - B| to evaluate the process. 
 	 * The default value is 0 (false), which means the gradient based evaluating method is used. 
 	 * The AMD based method will be used if this variable is set to a non-zero value (true). 
 	 * This parameter is only applied to the non-constrained methods.
 	 */
 	int abs_diff;
 };
 /**
 * Default parameters for the complex conjugate gradient methods, in clcg_para member order:
 * max_iterations = 0, epsilon = 1e-8, abs_diff = 0.
 */
 static const clcg_para defparam2 = {0, 1e-8, 0};
 /**
 * @brief      Return a clcg_para type instance with default values.
 * 
 * Users can use this function to get the default parameter values (a copy of defparam2)
 * for the complex conjugate gradient methods.
 * 
 * @return     A clcg_para type instance.
 */
 clcg_para clcg_default_parameters();
 /**
 * @brief      Select a type of solver according to the name
 *
 * @param[in]  slr_char  Name of the solver, spelled like the enumerator (e.g. "CLCG_CGS")
 *
 * @return     The clcg solver enum.
 * @throws     std::invalid_argument if the name is not recognized.
 */
 clcg_solver_enum clcg_select_solver(std::string slr_char);
 /**
 * @brief      Display or throw out a string explanation for the clcg_solver() function's return values.
 *
 * Nothing is returned. The explanation is written to std::cerr, unless er_throw is true
 * and er_index is negative, in which case it is thrown as a std::runtime_error.
 *
 * @param[in]  er_index  The error index returned by the clcg_solver() function.
 * @param[in]  er_throw  Throw the explanation of a negative er_index instead of printing it.
 */
 void clcg_error_str(int er_index, bool er_throw = false);
 #endif // _LCG_UTIL_H
--- a/src/sample/sample1.cpp
+++ b/src/sample/sample1.cpp
@@ -0,0 +1,167 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "cmath"
 #include "iostream"
 #include "../lib/lcg.h"
 #define M 100
 #define N 80
 // Return the largest element-wise distance sqrt((a[i]-b[i])^2) over the first
 // `size` entries of a and b. The running maximum starts at -1, so -1 is
 // returned when size is not positive.
 lcg_float max_diff(const lcg_float *a, const lcg_float *b, int size)
 {
 	lcg_float largest = -1;
 	int idx = 0;
 	while (idx < size)
 	{
 		const lcg_float delta = a[idx] - b[idx];
 		largest = lcg_max(sqrt(delta*delta), largest);
 		++idx;
 	}
 	return largest;
 }
 // Kernel matrix stored as a plain two-dimensional array
 lcg_float **kernel;
 // Buffer holding intermediate results (the output of the first lcg_matvec call in CalAx)
 lcg_float *tmp_arr;
 // Preconditioner; CalMx multiplies it element-wise with the input vector
 lcg_float *p;
 // Product callback handed to the solvers: fills prod_Ax from x in two steps.
 void CalAx(void* instance, const lcg_float* x, lcg_float* prod_Ax, const int n_s)
 {
 	// Step 1: kernel applied with MatNormal, result kept in the global tmp_arr
 	lcg_matvec(kernel, x, tmp_arr, M, n_s, MatNormal);
 	// Step 2: kernel applied with MatTranspose to tmp_arr (presumably giving kernel^T * kernel * x)
 	lcg_matvec(kernel, tmp_arr, prod_Ax, M, n_s, MatTranspose);
 }
 // Preconditioning callback: prod_Mx[i] = p[i] * x[i] for the first n_s entries,
 // where p is the global preconditioner array filled in main().
 void CalMx(void* instance, const lcg_float* x, lcg_float* prod_Mx, const int n_s)
 {
 	// Use a signed index so that the comparison with n_s is not done in
 	// unsigned arithmetic (a negative n_s then simply skips the loop)
 	for (int i = 0; i < n_s; i++)
 	{
 		prod_Mx[i] = p[i]*x[i];
 	}
 	return;
 }
 // Progress monitor for the conjugate gradient solvers. It rewrites the current
 // log line (leading '\r') with the iteration count and the convergence value,
 // and always returns 0.
 int Prog(void* instance, const lcg_float* m, const lcg_float converge, const lcg_para* param, const int n_s, const int k)
 {
 	std::clog << "\rIteration-times: " << k;
 	std::clog << "\tconvergence: " << converge;
 	return 0;
 }
 // Sample driver: fills the M x N kernel and a reference model fm with
 // lcg_vecrnd, builds the right-hand side B from fm, then runs each solver
 // (cg, pcg, cgs, bicgstab, bicgstab2, pg, spg) from a zero initial model and
 // prints the maximal difference between the result m and fm.
 int main(int argc, char const *argv[])
 {
 	kernel = lcg_malloc(M, N);
 	tmp_arr = lcg_malloc(M);
 	p = lcg_malloc(N);
 	lcg_vecrnd(kernel, -1.0, 1.0, M, N);
 	// Generate a reference (forward) model
 	lcg_float *fm = lcg_malloc(N);
 	lcg_vecrnd(fm, 1.0, 2.0, N);
 	// Compute the B term of the conjugate gradient system with the same two
 	// lcg_matvec calls that CalAx uses
 	lcg_float *B = lcg_malloc(N);
 	lcg_matvec(kernel, fm, tmp_arr, M, N, MatNormal);
 	lcg_matvec(kernel, tmp_arr, B, M, N, MatTranspose);
 	/********************Preparation finished************************/
 	lcg_para self_para = lcg_default_parameters();
 	self_para.epsilon = 1e-7;
 	self_para.abs_diff = 0;
 	// Declare a solution vector and start from zero
 	lcg_float *m = lcg_malloc(N);
 	lcg_vecset(m, 0.0, N);
 	// Build the preconditioning factors: p[i] is the reciprocal of the sum of
 	// squares of the i-th kernel column
 	lcg_float diag;
 	for (size_t i = 0; i < N; i++)
 	{
 		diag = 0.0;
 		for (size_t j = 0; j < M; j++)
 		{
 			diag += kernel[j][i]*kernel[j][i];
 		}
 		p[i] = 1.0/diag;
 	}
 	// Bounds of the solution used by the constrained solvers
 	lcg_float *low = lcg_malloc(N);
 	lcg_float *hig = lcg_malloc(N);
 	lcg_vecset(low, 1.0, N);
 	lcg_vecset(hig, 2.0, N);
 	int ret;
 	std::clog << "solver: cg" << std::endl;
 	ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_CG);
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	// Reset the model before every subsequent run
 	lcg_vecset(m, 0.0, N);
 	std::clog << "solver: pcg" << std::endl;
 	ret = lcg_solver_preconditioned(CalAx, CalMx, Prog, m, B, N, &self_para, NULL, LCG_PCG);
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	std::clog << "solver: cgs" << std::endl;
 	ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_CGS);
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	std::clog << "solver: bicgstab" << std::endl;
 	ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_BICGSTAB);
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	std::clog << "solver: bicgstab2" << std::endl;
 	ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_BICGSTAB2);
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	// The two constrained solvers additionally receive the low/hig bounds
 	lcg_vecset(m, 0.0, N);
 	std::clog << "solver: pg" << std::endl;
 	ret = lcg_solver_constrained(CalAx, Prog, m, B, low, hig, N, &self_para, NULL, LCG_PG);
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	std::clog << "solver: spg" << std::endl;
 	ret = lcg_solver_constrained(CalAx, Prog, m, B, low, hig, N, &self_para, NULL, LCG_SPG);
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	// Release everything obtained from lcg_malloc
 	lcg_free(kernel, M);
 	lcg_free(tmp_arr);
 	lcg_free(fm);
 	lcg_free(B);
 	lcg_free(m);
 	lcg_free(p);
 	lcg_free(low);
 	lcg_free(hig);
 	return 0;
 }
--- a/src/sample/sample10.cu
+++ b/src/sample/sample10.cu
@@ -0,0 +1,318 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cmath>
 #include "../lib/solver_cuda.h"
 // Global complex constants 1+0i and 0+0i. Their addresses are passed as the
 // scaling factors of cusparseSpMV (one, zero) and of the csrsv2 solves (one).
 cuDoubleComplex one = {1.0, 0.0};
 cuDoubleComplex zero = {0.0, 0.0};
 // Read a sparse complex linear system from a binary file.
 // Layout consumed: int N, int nz, then nz records of (int row, int column,
 // cuDoubleComplex value), then N cuDoubleComplex entries of the vector b.
 // The four output arrays are allocated with new[]; the caller releases them
 // with delete[].
 // Throws std::ios_base::failure when the file cannot be opened, is shorter
 // than its header promises, or declares a negative size.
 void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
 	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("Fail to open file: " + filePath);
 	}
 	// From here on every failed or short read raises std::ios_base::failure
 	// instead of silently leaving the outputs uninitialized
 	in.exceptions(std::ifstream::failbit | std::ifstream::badbit);
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	if (*pN < 0 || *pnz < 0)
 	{
 		throw std::ios_base::failure("Invalid sizes found in file: " + filePath);
 	}
 	*cooVal = new cuDoubleComplex[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new cuDoubleComplex[*pN]{};
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
 	}
 	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
 	return;
 }
 // Read a reference solution from a binary file: int N followed by N
 // cuDoubleComplex values. *x is allocated with new[] and must be released by
 // the caller with delete[].
 // Throws std::ios_base::failure when the file cannot be opened, is truncated,
 // or declares a negative size.
 void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("Fail to open file: " + filePath);
 	}
 	// Turn failed or short reads into exceptions
 	in.exceptions(std::ifstream::failbit | std::ifstream::badbit);
 	in.read((char*)pN, sizeof(int));
 	if (*pN < 0)
 	{
 		throw std::ios_base::failure("Invalid size found in file: " + filePath);
 	}
 	*x = new cuDoubleComplex[*pN]{};
 	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
 	return;
 }
 // Error measure between two complex vectors of length n: the square root of
 // the summed squared moduli of clcg_Zdiff(a[i], b[i]), divided by n.
 // Returns 0 when n is not positive.
 lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
 {
 	// Nothing to compare; this also avoids evaluating 0/0 below
 	if (n <= 0) return 0.0;
 	lcg_float avg = 0.0;
 	cuDoubleComplex tmp;
 	// Signed index: keeps the comparison with n in signed arithmetic
 	for (int i = 0; i < n; i++)
 	{
 		tmp = clcg_Zdiff(a[i], b[i]);
 		avg += (tmp.x*tmp.x + tmp.y*tmp.y);
 	}
 	return sqrt(avg)/n;
 }
 // Sample solver derived from CLCG_CUDA_Solver. It provides the A*x product
 // and the preconditioning product, and owns the host/device buffers and
 // cuSPARSE descriptors that solve() creates and releases.
 class sample10 : public CLCG_CUDA_Solver
 {
 public:
 	sample10(){}
 	virtual ~sample10(){}
 	// Load the system and the reference answer, then run MinimizePreconditioned
 	// twice: first with the diagonal preconditioner, then with incomplete Cholesky.
 	void solve(std::string inputPath, std::string answerPath);
 	// prod_Ax = op(A) * x through cusparseSpMV, with op selected by oper_t and
 	// d_buf as the work buffer. cub_handle, n_size and nz_size are not used here.
 	void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
 	cusparseOperation_t oper_t)
 	{
 		// Calculate the product of A*x
 		cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax, CUDA_C_64F, CUSPARSE_MV_ALG_DEFAULT, d_buf);
 		return;
 	}
 	// Preconditioning product; the result is written into prod_Ax.
 	// Depending on use_incomplete_cholesky it performs either two triangular
 	// solves with the factor stored in d_ic, or an element-wise operation with d_pd.
 	void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
 		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
 		cusparseOperation_t oper_t)
 	{
 		// Fetch the raw value pointers held by the dense vector descriptors
 		void *d_x, *d_Ax;
 		cusparseDnVecGetValues(x, &d_x);
 		cusparseDnVecGetValues(prod_Ax, &d_Ax);
 		if (use_incomplete_cholesky)
 		{
 			// First solve (NON_TRANSPOSE): x -> d_pd. Note that d_pd is used as
 			// scratch here, which overwrites the diagonal values stored in it.
 			cusparseZcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, (cuDoubleComplex*) d_x, (cuDoubleComplex*) d_pd, 
 				CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 			// Second solve (TRANSPOSE): d_pd -> prod_Ax
 			cusparseZcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, (cuDoubleComplex*) d_pd, (cuDoubleComplex*) d_Ax, 
 				CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 		}
 		else
 		{
 			// Diagonal preconditioning with the values stored in d_pd
 			// (presumably an element-wise division x ./ d_pd -- confirm against clcg_vecDvecZ_element_wise)
 			clcg_vecDvecZ_element_wise((cuDoubleComplex*) d_x, d_pd, (cuDoubleComplex*) d_Ax, n_size);
 		}	
 		return;
 	}
 private:
 	// Selects the preconditioner applied by MxProduct; set in solve() before each run
 	bool use_incomplete_cholesky;
 	// System size and number of stored matrix entries, as read from the input file
 	int N, nz;
 	// Host copies of the COO matrix, the vector b and the reference answer
 	int *rowIdxA, *colIdxA;
 	cuDoubleComplex *A, *b;
 	cuDoubleComplex *ans_x;
 	// Device work buffer shared by SpMV, the factorization and the triangular solves
 	void *d_buf;
 	// cuSPARSE CSR descriptor of A used by cusparseSpMV
 	cusparseSpMatDescr_t smat_A;
 	int *d_rowIdxA; // COO
 	int *d_rowPtrA; // CSR
 	int *d_colIdxA;
 	// Device copy of the matrix values
 	cuDoubleComplex *d_A;
 	// Diagonal elements of A; reused as scratch by the incomplete-Cholesky branch of MxProduct
 	cuDoubleComplex *d_pd;
 	// Copy of the matrix values that is factorized in place by cusparseZcsric02
 	cuDoubleComplex *d_ic;
 	// Legacy descriptors and info objects of the csric02/csrsv2 routines
 	cusparseMatDescr_t descr_A;
 	cusparseMatDescr_t descr_L;
 	csric02Info_t icinfo_A;
 	csrsv2Info_t info_L;
 	csrsv2Info_t info_LT;
 	// Host solution vector passed to MinimizePreconditioned
 	cuDoubleComplex *host_m;
 	// NOTE(review): solve() declares a local variable of the same name, so this member looks unused
 	cusparseDnVecDescr_t dvec_tmp;
 };
 // Load the linear system from inputPath and the reference solution from
 // answerPath, upload the matrix to the device, prepare the diagonal and the
 // incomplete-Cholesky preconditioners, and run MinimizePreconditioned once
 // with each of them. After each run the averaged error against the reference
 // solution is written to std::clog. All host and device resources created here
 // are released before returning.
 void sample10::solve(std::string inputPath, std::string answerPath)
 {
 	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	// Read the answer length into its own variable so that it cannot silently
 	// overwrite N, which sizes every buffer below
 	int ans_N = 0;
 	readAnswer(answerPath, &ans_N, &ans_x);
 	if (ans_N != N)
 	{
 		std::cerr << "Size mismatch: the system has N = " << N
 			<< " but the answer holds " << ans_N << " elements" << std::endl;
 		delete[] A;
 		delete[] rowIdxA;
 		delete[] colIdxA;
 		delete[] b;
 		delete[] ans_x;
 		return;
 	}
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	// Create handles
 	cublasHandle_t cubHandle;
 	cusparseHandle_t cusHandle;
 	cublasCreate(&cubHandle);
 	cusparseCreate(&cusHandle);
 	// Allocate GPU memory & copy matrix/vector to device
 	cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex));
 	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
 	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxA, nz * sizeof(int));
 	cudaMalloc(&d_pd, N * sizeof(cuDoubleComplex));
 	cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	// Convert matrix A from COO format to CSR format
 	cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
 	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
 	// This is just used to get bufferSize. The class member dvec_tmp is used
 	// here instead of a local variable that would shadow it.
 	cusparseCreateDnVec(&dvec_tmp, N, d_pd, CUDA_C_64F);
 	size_t bufferSize_B = 0;
 	cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
 		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize_B);
 	// --- Start of the preconditioning part ---
 	// Get the diagonal elements
 	clcg_smZcsr_get_diagonal(d_rowPtrA, d_colIdxA, d_A, N, d_pd);
 	// Copy A; the copy is factorized in place further below
 	cudaMalloc(&d_ic, nz * sizeof(cuDoubleComplex));
 	cudaMemcpy(d_ic, d_A, nz * sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice);
 	// create descriptor for matrix A
 	cusparseCreateMatDescr(&descr_A);
 	// initialize properties of matrix A
 	cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_SYMMETRIC);
 	cusparseSetMatFillMode(descr_A, CUSPARSE_FILL_MODE_LOWER);
 	cusparseSetMatDiagType(descr_A, CUSPARSE_DIAG_TYPE_NON_UNIT);
 	cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO);
 	// create descriptor for matrix L
 	cusparseCreateMatDescr(&descr_L);
 	// initialize properties of matrix L
 	cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
 	cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
 	cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT);
 	cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
 	// Create empty info objects for incomplete-cholesky factorization
 	cusparseCreateCsric02Info(&icinfo_A);
 	cusparseCreateCsrsv2Info(&info_L);
 	cusparseCreateCsrsv2Info(&info_LT);
 	// Compute buffer size in computing ic factorization
 	int bufferSize_A = 0, bufferSize_L = 0, bufferSize_LT = 0;
 	cusparseZcsric02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA, 
 		d_colIdxA, icinfo_A, &bufferSize_A);
 	cusparseZcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
 		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, &bufferSize_L);
 	cusparseZcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, 
 		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, &bufferSize_LT);
 	// A single work buffer serves every routine, so take the largest request.
 	// The maximum is kept as size_t so the SpMV size is not narrowed to int.
 	size_t bufferSize = bufferSize_B;
 	if (bufferSize_A > 0 && (size_t) bufferSize_A > bufferSize) bufferSize = (size_t) bufferSize_A;
 	if (bufferSize_L > 0 && (size_t) bufferSize_L > bufferSize) bufferSize = (size_t) bufferSize_L;
 	if (bufferSize_LT > 0 && (size_t) bufferSize_LT > bufferSize) bufferSize = (size_t) bufferSize_LT;
 	cudaMalloc(&d_buf, bufferSize);
 	// Perform incomplete-choleskey factorization: analysis phase
 	cusparseZcsric02_analysis(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, 
 		d_colIdxA, icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	cusparseZcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
 		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	cusparseZcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, 
 		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	// Perform incomplete-choleskey factorization: solve phase
 	cusparseZcsric02(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, d_colIdxA, 
 		icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	// --- End of the preconditioning part ---
 	// Declare an initial solution
 	host_m = new cuDoubleComplex[N];
 	clcg_para self_para = clcg_default_parameters();
 	self_para.epsilon = 1e-6;
 	// Preconditioning with Diagonal elements.
 	// This run must come first: the incomplete-Cholesky branch of MxProduct
 	// overwrites d_pd, which holds the diagonal.
 	for (int i = 0; i < N; i++)
 	{
 		host_m[i].x = 0.0; host_m[i].y = 0.0;
 	}
 	use_incomplete_cholesky = false;
 	MinimizePreconditioned(cubHandle, cusHandle, host_m, b, N, nz, CLCG_PCG);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	// Preconditioning with incomplete-Cholesky factorization
 	for (int i = 0; i < N; i++)
 	{
 		host_m[i].x = 0.0; host_m[i].y = 0.0;
 	}
 	use_incomplete_cholesky = true;
 	MinimizePreconditioned(cubHandle, cusHandle, host_m, b, N, nz, CLCG_PCG);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	// Free Host memory
 	delete[] A;
 	delete[] rowIdxA;
 	delete[] colIdxA;
 	delete[] b;
 	delete[] ans_x;
 	delete[] host_m;
 	// Free Device memory
 	cudaFree(d_A);
 	cudaFree(d_rowIdxA);
 	cudaFree(d_rowPtrA);
 	cudaFree(d_colIdxA);
 	cudaFree(d_pd);
 	cudaFree(d_ic);
 	cusparseDestroyDnVec(dvec_tmp);
 	cusparseDestroySpMat(smat_A);
 	cudaFree(d_buf);
 	cusparseDestroyMatDescr(descr_A);
 	cusparseDestroyMatDescr(descr_L);
 	cusparseDestroyCsric02Info(icinfo_A);
 	cusparseDestroyCsrsv2Info(info_L);
 	cusparseDestroyCsrsv2Info(info_LT);
 	// Free handles
 	cublasDestroy(cubHandle);
 	cusparseDestroy(cusHandle);
 	return;
 }
 // Entry point: solve the fixed 10K test case with a sample10 instance whose
 // report interval is set to 0. Command line arguments are not used.
 int main(int argc, char **argv)
 {
 	const std::string inputPath("data/case_10K_cA");
 	const std::string answerPath("data/case_10K_cB");
 	sample10 sp;
 	sp.set_report_interval(0);
 	sp.solve(inputPath, answerPath);
 	return 0;
 }
--- a/src/sample/sample11.cu
+++ b/src/sample/sample11.cu
@@ -0,0 +1,299 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cmath>
 #include "../lib/clcg_cuda.h"
 // Read a sparse complex linear system from a binary file.
 // Layout consumed: int N, int nz, then nz records of (int row, int column,
 // cuDoubleComplex value), then N cuDoubleComplex entries of the vector b.
 // The four output arrays are allocated with new[]; the caller releases them
 // with delete[].
 // Throws std::ios_base::failure when the file cannot be opened, is shorter
 // than its header promises, or declares a negative size.
 void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
 	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("Fail to open file: " + filePath);
 	}
 	// From here on every failed or short read raises std::ios_base::failure
 	// instead of silently leaving the outputs uninitialized
 	in.exceptions(std::ifstream::failbit | std::ifstream::badbit);
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	if (*pN < 0 || *pnz < 0)
 	{
 		throw std::ios_base::failure("Invalid sizes found in file: " + filePath);
 	}
 	*cooVal = new cuDoubleComplex[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new cuDoubleComplex[*pN]{};
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
 	}
 	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
 	return;
 }
 // Read a reference solution from a binary file: int N followed by N
 // cuDoubleComplex values. *x is allocated with new[] and must be released by
 // the caller with delete[].
 // Throws std::ios_base::failure when the file cannot be opened, is truncated,
 // or declares a negative size.
 void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("Fail to open file: " + filePath);
 	}
 	// Turn failed or short reads into exceptions
 	in.exceptions(std::ifstream::failbit | std::ifstream::badbit);
 	in.read((char*)pN, sizeof(int));
 	if (*pN < 0)
 	{
 		throw std::ios_base::failure("Invalid size found in file: " + filePath);
 	}
 	*x = new cuDoubleComplex[*pN]{};
 	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
 	return;
 }
 // Error measure between two complex vectors of length n: the square root of
 // the summed squared moduli of clcg_Zdiff(a[i], b[i]), divided by n.
 // Returns 0 when n is not positive.
 lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
 {
 	// Nothing to compare; this also avoids evaluating 0/0 below
 	if (n <= 0) return 0.0;
 	lcg_float avg = 0.0;
 	cuDoubleComplex tmp;
 	// Signed index: keeps the comparison with n in signed arithmetic
 	for (int i = 0; i < n; i++)
 	{
 		tmp = clcg_Zdiff(a[i], b[i]);
 		avg += (tmp.x*tmp.x + tmp.y*tmp.y);
 	}
 	return sqrt(avg)/n;
 }
 // Global state shared between main() and the solver callbacks below.
 // Scaling factors passed by address to cuSPARSE; they are assigned inside
 // cudaAx and cudaMx_ILU (only `one` in the latter)
 cuDoubleComplex one, zero;
 // Device work buffer shared by SpMV, the ILU factorization and the triangular solves
 void *d_buf;
 // cuSPARSE CSR descriptor of A used by cusparseSpMV
 cusparseSpMatDescr_t smat_A;
 int *d_rowIdxA; // COO
 int *d_rowPtrA; // CSR
 int *d_colIdxA;
 // Device copy of the matrix values
 cuDoubleComplex *d_A;
 // Intermediate vector between the two triangular solves in cudaMx_ILU
 cuDoubleComplex *d_pd;
 // Copy of the matrix values that is factorized in place by cusparseZcsrilu02
 cuDoubleComplex *d_iu;
 // Legacy matrix descriptors and info objects of the csrilu02/csrsv2 routines
 cusparseMatDescr_t descr_A = 0;
 cusparseMatDescr_t descr_L = 0;
 cusparseMatDescr_t descr_U = 0;
 csrilu02Info_t info_ILU = 0;
 csrsv2Info_t info_L = 0;
 csrsv2Info_t info_U = 0;
 // Matrix-vector product callback handed to the solver: prod_Ax = op(A) * x,
 // where op is selected by oper_t and A is the global CSR descriptor smat_A.
 // instance, cub_handle, n_size and nz_size are not used.
 void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
 	cusparseOperation_t oper_t)
 {
 	// Refresh the global scaling factors (alpha = 1, beta = 0) on every call
 	zero.x = 0.0; zero.y = 0.0;
 	one.x = 1.0; one.y = 0.0;
 	// Sparse product with the global work buffer d_buf
 	cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax,
 		CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
 }
 // Preconditioning callback based on the incomplete-LU factors stored in d_iu.
 // Two csrsv2 triangular solves are chained: x -> d_pd with descr_L/info_L,
 // then d_pd -> prod_Ax with descr_U/info_U.
 // instance, cub_handle and oper_t are not used.
 void cudaMx_ILU(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
 	cusparseOperation_t oper_t)
 {
 	// Raw value pointers held by the dense vector descriptors
 	void *src_raw, *dst_raw;
 	cusparseDnVecGetValues(x, &src_raw);
 	cusparseDnVecGetValues(prod_Ax, &dst_raw);
 	cuDoubleComplex *src = (cuDoubleComplex*) src_raw;
 	cuDoubleComplex *dst = (cuDoubleComplex*) dst_raw;
 	one.x = 1.0; one.y = 0.0;
 	// Solve with the L descriptor, result kept in the global d_pd
 	cusparseZcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_size, nz_size, &one,
 		descr_L, d_iu, d_rowPtrA, d_colIdxA, info_L, src, d_pd,
 		CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	// Solve with the U descriptor, result written to the output vector
 	cusparseZcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_size, nz_size, &one,
 		descr_U, d_iu, d_rowPtrA, d_colIdxA, info_U, d_pd, dst,
 		CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 }
 // Progress callback: stays silent until the convergence value is no larger
 // than param->epsilon, then logs the iteration count and that value.
 // Always returns 0.
 int cudaProgress(void* instance, const cuDoubleComplex* m, const lcg_float converge, 
 	const clcg_para* param, const int n_size, const int nz_size, const int k)
 {
 	const bool reached = (converge <= param->epsilon);
 	if (!reached) return 0;
 	std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
 	return 0;
 }
 // Sample driver: reads the 1M test case and its reference answer, uploads the
 // matrix, builds an incomplete-LU preconditioner with the csrilu02/csrsv2
 // routines, runs the preconditioned CLCG_PCG solver and reports the averaged
 // error against the reference answer. Returns 1 when the answer size does not
 // match the system size, 0 otherwise.
 int main(int argc, char **argv)
 {
 	std::string inputPath = "data/case_1M_cA";
 	std::string answerPath = "data/case_1M_cB";
 	// The global scaling factors are passed to cusparseSpMV_bufferSize below,
 	// before any callback had a chance to assign them, so set them here
 	one.x = 1.0; one.y = 0.0;
 	zero.x = 0.0; zero.y = 0.0;
 	int N;
 	int nz;
 	cuDoubleComplex *A;
 	int *rowIdxA;
 	int *colIdxA;
 	cuDoubleComplex *b;
 	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	// Read the answer length into its own variable so that it cannot silently
 	// overwrite N, which sizes every buffer below
 	int ans_N = 0;
 	cuDoubleComplex *ans_x;
 	readAnswer(answerPath, &ans_N, &ans_x);
 	if (ans_N != N)
 	{
 		std::cerr << "Size mismatch: the system has N = " << N
 			<< " but the answer holds " << ans_N << " elements" << std::endl;
 		delete[] A;
 		delete[] rowIdxA;
 		delete[] colIdxA;
 		delete[] b;
 		delete[] ans_x;
 		return 1;
 	}
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	// Create handles
 	cublasHandle_t cubHandle;
 	cusparseHandle_t cusHandle;
 	cublasCreate(&cubHandle);
 	cusparseCreate(&cusHandle);
 	// Allocate GPU memory & copy matrix/vector to device
 	cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex));
 	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
 	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxA, nz * sizeof(int));
 	cudaMalloc(&d_pd, N * sizeof(cuDoubleComplex));
 	cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	// Convert matrix A from COO format to CSR format
 	cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
 	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
 	// This is just used to get bufferSize;
 	cusparseDnVecDescr_t dvec_tmp;
 	cusparseCreateDnVec(&dvec_tmp, N, d_pd, CUDA_C_64F);
 	// Query the size with the same algorithm selector that cudaAx runs with
 	size_t bufferSize_B = 0;
 	cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
 		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B);
 	// --- Start of the preconditioning part ---
 	// Copy A; the copy is factorized in place further below
 	cudaMalloc(&d_iu, nz * sizeof(cuDoubleComplex));
 	cudaMemcpy(d_iu, d_A, nz * sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice);
 	// create descriptor for matrix A
 	cusparseCreateMatDescr(&descr_A);
 	cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_GENERAL);
 	cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO);
 	// create descriptor for matrix L
 	cusparseCreateMatDescr(&descr_L);
 	// initialize properties of matrix L
 	cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
 	cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
 	cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
 	cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
 	// create descriptor for matrix U
 	cusparseCreateMatDescr(&descr_U);
 	cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
 	cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
 	cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
 	cusparseSetMatIndexBase(descr_U, CUSPARSE_INDEX_BASE_ZERO);
 	// Create empty info objects for incomplete-LU factorization
 	cusparseCreateCsrilu02Info(&info_ILU);
 	cusparseCreateCsrsv2Info(&info_L);
 	cusparseCreateCsrsv2Info(&info_U);
 	// Compute buffer size in computing the ILU factorization
 	int bufferSize_A = 0, bufferSize_L = 0, bufferSize_U = 0;
 	cusparseZcsrilu02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA, 
 		d_colIdxA, info_ILU, &bufferSize_A);
 	cusparseZcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
 		N, nz, descr_L, d_iu, d_rowPtrA, d_colIdxA, info_L, &bufferSize_L);
 	cusparseZcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
 		N, nz, descr_U, d_iu, d_rowPtrA, d_colIdxA, info_U, &bufferSize_U);
 	// A single work buffer serves every routine, so take the largest request.
 	// The maximum is kept as size_t so the SpMV size is not narrowed to int.
 	size_t bufferSize = bufferSize_B;
 	if (bufferSize_A > 0 && (size_t) bufferSize_A > bufferSize) bufferSize = (size_t) bufferSize_A;
 	if (bufferSize_L > 0 && (size_t) bufferSize_L > bufferSize) bufferSize = (size_t) bufferSize_L;
 	if (bufferSize_U > 0 && (size_t) bufferSize_U > bufferSize) bufferSize = (size_t) bufferSize_U;
 	cudaMalloc(&d_buf, bufferSize);
 	// Perform incomplete-LU factorization: analysis phase
 	cusparseZcsrilu02_analysis(cusHandle, N, nz, descr_A, d_iu, d_rowPtrA, 
 		d_colIdxA, info_ILU, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	cusparseZcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
 		N, nz, descr_L, d_iu, d_rowPtrA, d_colIdxA, info_L, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	cusparseZcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
 		N, nz, descr_U, d_iu, d_rowPtrA, d_colIdxA, info_U, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	// Perform incomplete-LU factorization: solve phase
 	cusparseZcsrilu02(cusHandle, N, nz, descr_A, d_iu, d_rowPtrA, d_colIdxA, 
 		info_ILU, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	// --- End of the preconditioning part ---
 	// Declare an initial solution
 	clcg_para self_para = clcg_default_parameters();
 	self_para.epsilon = 1e-6;
 	self_para.abs_diff = 0;
 	int ret;
 	cuDoubleComplex *host_m = new cuDoubleComplex[N];
 	// Preconditioning with incomplete-LU factorization
 	for (int i = 0; i < N; i++)
 	{
 		host_m[i].x = 0.0; host_m[i].y = 0.0;
 	}
 	ret = clcg_solver_preconditioned_cuda(cudaAx, cudaMx_ILU, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, CLCG_PCG);
 	// Report with the clcg variant, which explains the clcg solvers' return values
 	clcg_error_str(ret);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	// Free Host memory
 	delete[] A;
 	delete[] rowIdxA;
 	delete[] colIdxA;
 	delete[] b;
 	delete[] ans_x;
 	delete[] host_m;
 	// Free Device memory
 	cudaFree(d_A);
 	cudaFree(d_rowIdxA);
 	cudaFree(d_rowPtrA);
 	cudaFree(d_colIdxA);
 	cudaFree(d_pd);
 	cudaFree(d_iu);
 	cusparseDestroyDnVec(dvec_tmp);
 	cusparseDestroySpMat(smat_A);
 	cudaFree(d_buf);
 	cusparseDestroyMatDescr(descr_A);
 	cusparseDestroyMatDescr(descr_L);
 	cusparseDestroyMatDescr(descr_U);
 	cusparseDestroyCsrilu02Info(info_ILU);
 	cusparseDestroyCsrsv2Info(info_L);
 	cusparseDestroyCsrsv2Info(info_U);
 	// Free handles
 	cublasDestroy(cubHandle);
 	cusparseDestroy(cusHandle);
 	return 0;
 }
--- a/src/sample/sample12.cu
+++ b/src/sample/sample12.cu
@@ -0,0 +1,306 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cmath>
 #include "../lib/solver_cuda.h"
 #include "../lib/preconditioner_cuda.h"
 // Global complex constants 1+0i and 0+0i. Their addresses are passed as the
 // scaling factors of cusparseSpMV (one, zero) and of cusparseSpSV_solve (one).
 cuDoubleComplex one = {1.0, 0.0};
 cuDoubleComplex zero = {0.0, 0.0};
 // Read a sparse complex linear system from a binary file.
 // Layout consumed: int N, int nz, then nz records of (int row, int column,
 // cuDoubleComplex value), then N cuDoubleComplex entries of the vector b.
 // The four output arrays are allocated with new[]; the caller releases them
 // with delete[].
 // Throws std::ios_base::failure when the file cannot be opened, is shorter
 // than its header promises, or declares a negative size.
 void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
 	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("Fail to open file: " + filePath);
 	}
 	// From here on every failed or short read raises std::ios_base::failure
 	// instead of silently leaving the outputs uninitialized
 	in.exceptions(std::ifstream::failbit | std::ifstream::badbit);
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	if (*pN < 0 || *pnz < 0)
 	{
 		throw std::ios_base::failure("Invalid sizes found in file: " + filePath);
 	}
 	*cooVal = new cuDoubleComplex[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new cuDoubleComplex[*pN]{};
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
 	}
 	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
 	return;
 }
 // Read a reference solution from a binary file: int N followed by N
 // cuDoubleComplex values. *x is allocated with new[] and must be released by
 // the caller with delete[].
 // Throws std::ios_base::failure when the file cannot be opened, is truncated,
 // or declares a negative size.
 void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("Fail to open file: " + filePath);
 	}
 	// Turn failed or short reads into exceptions
 	in.exceptions(std::ifstream::failbit | std::ifstream::badbit);
 	in.read((char*)pN, sizeof(int));
 	if (*pN < 0)
 	{
 		throw std::ios_base::failure("Invalid size found in file: " + filePath);
 	}
 	*x = new cuDoubleComplex[*pN]{};
 	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
 	return;
 }
 // Error measure between two complex vectors of length n: the square root of
 // the summed squared moduli of clcg_Zdiff(a[i], b[i]), divided by n.
 // Returns 0 when n is not positive.
 lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
 {
 	// Nothing to compare; this also avoids evaluating 0/0 below
 	if (n <= 0) return 0.0;
 	lcg_float avg = 0.0;
 	cuDoubleComplex tmp;
 	// Signed index: keeps the comparison with n in signed arithmetic
 	for (int i = 0; i < n; i++)
 	{
 		tmp = clcg_Zdiff(a[i], b[i]);
 		avg += (tmp.x*tmp.x + tmp.y*tmp.y);
 	}
 	return sqrt(avg)/n;
 }
 // Sample solver derived from CLCG_CUDA_Solver. It provides the A*x product
 // through cusparseSpMV and a preconditioning product made of two
 // cusparseSpSV_solve calls on the matrix smat_IC.
 class sample12 : public CLCG_CUDA_Solver
 {
 public:
 	sample12(){}
 	virtual ~sample12(){}
 	// Load the system and its reference answer and set up the device data;
 	// the handles are supplied by the caller.
 	void solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 	// prod_Ax = op(A) * x through cusparseSpMV, with op selected by oper_t and
 	// d_buf as the work buffer. cub_handle, n_size and nz_size are not used here.
 	void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		// Calculate the product of A*x
 		cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
 		return;
 	}
 	// Preconditioning product: a NON_TRANSPOSE solve x -> dvec_p followed by a
 	// TRANSPOSE solve dvec_p -> prod_Ax, both on smat_IC. oper_t is not used.
 	void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		// First solve, using the SpSV descriptor descr_L
 		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_IC, x, dvec_p, 
 			CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L);
 		// Second solve, using the SpSV descriptor descr_LT
 		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_IC, dvec_p, prod_Ax, 
 			CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT);
 		return;
 	}
 private:
 	// System size and number of stored matrix entries, as read from the input file
 	int N, nz;
 	// Host copies of the COO matrix, the vector b and the reference answer
 	int *rowIdxA, *colIdxA;
 	cuDoubleComplex *A, *b;
 	cuDoubleComplex *ans_x;
 	// Host COO arrays filled by clcg_incomplete_Cholesky_cuda_full in solve()
 	int *IC_row, *IC_col;
    cuDoubleComplex *IC_val;
 	// Device work buffers; d_buf is the one passed to cusparseSpMV
 	void *d_buf, *d_buf2;
 	// cuSPARSE CSR descriptors of A and of the incomplete-Cholesky matrix
 	cusparseSpMatDescr_t smat_A;
 	cusparseSpMatDescr_t smat_IC;
 	// SpSV descriptors of the NON_TRANSPOSE and TRANSPOSE triangular solves
 	cusparseSpSVDescr_t descr_L, descr_LT;
 	int *d_rowIdxA; // COO
 	int *d_rowPtrA; // CSR
 	int *d_colIdxA;
 	// Device copy of the matrix values
 	cuDoubleComplex *d_A;
 	// Device vector (and its descriptor) holding the result of the first solve in MxProduct
 	cuDoubleComplex *d_p;
 	cusparseDnVecDescr_t dvec_p;
 	int *d_rowIdxIC; // COO
 	int *d_rowPtrIC; // CSR
 	int *d_colIdxIC;
 	// Device copy of the IC_val values
 	cuDoubleComplex *d_IC;
 	// Host solution vector
 	cuDoubleComplex *host_m;
 	// Temporary device vector (and its descriptor) used in solve() for the buffer size queries
 	cuDoubleComplex *d_t;
 	cusparseDnVecDescr_t dvec_tmp;
 };
 // Load a complex sparse system from inputPath and a reference solution from
 // answerPath, set up an incomplete-Cholesky preconditioner with cuSPARSE, run
 // MinimizePreconditioned() with CLCG_PCG and print the averaged error against
 // the reference solution to std::clog.
 //
 // cub_handle / cus_handle are caller-created cuBLAS / cuSPARSE handles; they are
 // used here but not destroyed here.
 //
 // Every host array and device resource acquired in this function is released
 // before it returns.
 // NOTE(review): none of the CUDA / cuSPARSE return codes are checked.
 void sample12::solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// Host side: matrix A in COO form (nz entries), right-hand side b and reference answer (N entries).
 	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	readAnswer(answerPath, &N, &ans_x);
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	// The preconditioner arrays are allocated with the same nz entries as A.
 	IC_row = new int [nz];
    IC_col = new int [nz];
    IC_val = new cuDoubleComplex [nz];
    // presumably fills IC_row / IC_col / IC_val with the incomplete Cholesky factorization of A — TODO confirm against preconditioner_cuda.h
    clcg_incomplete_Cholesky_cuda_full(rowIdxA, colIdxA, A, N, nz, IC_row, IC_col, IC_val);
 /*
 	for (size_t i = 0; i < nz; i++)
 	{
 		if (IC_row[i] >= IC_col[i])
 		{
 			std::cout << IC_row[i] << " " << IC_col[i] << " (" << IC_val[i].x << "," << IC_val[i].y << ")\n";	
 		}
 	}
 */
    // Allocate GPU memory & copy matrix/vector to device
 	cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex));
 	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
 	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxA, nz * sizeof(int));
 	// d_p backs dvec_p, the intermediate vector between the two triangular solves in MxProduct().
 	cudaMalloc(&d_p, N * sizeof(cuDoubleComplex));
    cusparseCreateDnVec(&dvec_p, N, d_p, CUDA_C_64F);
 	cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
    // Same layout for the preconditioner matrix.
    cudaMalloc(&d_IC, nz * sizeof(cuDoubleComplex));
 	cudaMalloc(&d_rowIdxIC, nz * sizeof(int));
 	cudaMalloc(&d_rowPtrIC, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxIC, nz * sizeof(int));
    cudaMemcpy(d_IC, IC_val, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxIC, IC_row, nz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxIC, IC_col, nz * sizeof(int), cudaMemcpyHostToDevice);
 	// Convert matrix A from COO format to CSR format
 	cusparseXcoo2csr(cus_handle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
 	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
 	// Convert matrix L from COO format to CSR format
    cusparseXcoo2csr(cus_handle, d_rowIdxIC, nz, N, d_rowPtrIC, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
    cusparseCreateCsr(&smat_IC, N, N, nz, d_rowPtrIC, d_colIdxIC, d_IC, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
 	// Specify Non-Unit diagonal type.
 	// (Disabled: no fill-mode or diagonal-type attribute is set on smat_IC in this function.)
    //cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
 	//cusparseSpMatSetAttribute(smat_IC, CUSPARSE_SPMAT_DIAG_TYPE, &diagtype, sizeof(diagtype));
    // This is just used to get bufferSize;
 	cudaMalloc(&d_t, N * sizeof(cuDoubleComplex));
 	cusparseCreateDnVec(&dvec_tmp, N, d_t, CUDA_C_64F);
 	size_t bufferSize_B;
 	cusparseSpMV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
 		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B);
    // --- Start of the preconditioning part ---
    cusparseSpSV_createDescr(&descr_L);
    cusparseSpSV_createDescr(&descr_LT);
    size_t bufferSize, bufferSize_L, bufferSize_LT;
 	bufferSize = bufferSize_B;
    cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_IC, dvec_p, 
        dvec_tmp, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, &bufferSize_L);
    cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_IC, dvec_p, 
        dvec_tmp, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, &bufferSize_LT);
    // Both work buffers get the largest of the three requested sizes (SpMV, SpSV, SpSV transposed).
    bufferSize = max(max(bufferSize, bufferSize_L), bufferSize_LT);
 	cudaMalloc(&d_buf, bufferSize);
 	cudaMalloc(&d_buf2, bufferSize);
 	// One analysis per solve direction: descr_L uses d_buf, descr_LT uses d_buf2.
 	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_IC, dvec_tmp, dvec_p, 
 		CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, d_buf);
 	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_IC, dvec_p, dvec_tmp, 
 		CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, d_buf2);
 	// --- End of the preconditioning part ---
 	// Declare an initial solution
 	// NOTE(review): self_para is filled in but never passed to anything in this function — confirm whether it should be applied to the solver.
    clcg_para self_para = clcg_default_parameters();
 	self_para.epsilon = 1e-6;
 	self_para.abs_diff = 0;
 	host_m = new cuDoubleComplex[N];
 	// Preconditioning with incomplete-Cholesky factorization
 	// The initial guess is the zero vector.
 	for (size_t i = 0; i < N; i++)
 	{
 		host_m[i].x = 0.0; host_m[i].y = 0.0;	
 	}
 	MinimizePreconditioned(cub_handle, cus_handle, host_m, b, N, nz, CLCG_PCG);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	// Free Host memory
 	if (rowIdxA != nullptr) delete[] rowIdxA;
 	if (colIdxA != nullptr) delete[] colIdxA;
    if (A != nullptr) delete[] A;
 	if (b != nullptr) delete[] b;
 	if (ans_x != nullptr) delete[] ans_x;
    if (IC_row != nullptr) delete[] IC_row;
    if (IC_col != nullptr) delete[] IC_col;
    if (IC_val != nullptr) delete[] IC_val;
    if (host_m != nullptr) delete[] host_m;
 	// Free device memory and cuSPARSE descriptors
 	cusparseDestroyDnVec(dvec_tmp);
    cusparseDestroyDnVec(dvec_p);
 	cudaFree(d_buf);
 	cudaFree(d_buf2);
 	cudaFree(d_rowIdxA);
 	cudaFree(d_rowPtrA);
 	cudaFree(d_colIdxA);
    cudaFree(d_A);
 	cudaFree(d_p);
 	cudaFree(d_t);
    cudaFree(d_rowIdxIC);
 	cudaFree(d_rowPtrIC);
 	cudaFree(d_colIdxIC);
    cudaFree(d_IC);
 	cusparseDestroySpMat(smat_A);
 	cusparseDestroySpMat(smat_IC);
    cusparseSpSV_destroyDescr(descr_L);
    cusparseSpSV_destroyDescr(descr_LT);
 	return;
 }
 int main(int argc, char **argv)
 {
 	std::string inputPath = "data/case_1M_cA";
 	std::string answerPath = "data/case_1M_cB";
 	cublasHandle_t cubHandle;
 	cusparseHandle_t cusHandle;
 	cublasCreate(&cubHandle);
 	cusparseCreate(&cusHandle);
 	sample12 sp;
 	sp.set_report_interval(0);
 	sp.solve(inputPath, answerPath, cubHandle, cusHandle);
 	cublasDestroy(cubHandle);
 	cusparseDestroy(cusHandle);
 	return 0;
 }
--- a/src/sample/sample13.cu
+++ b/src/sample/sample13.cu
@@ -0,0 +1,305 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cmath>
 #include "../lib/solver_cuda.h"
 #include "../lib/preconditioner_cuda.h"
 // Declare as global variables
 // Complex constants 1+0i and 0+0i. Their addresses are passed as the scalar
 // arguments of the cusparseSpMV / cusparseSpSV calls in this sample.
 cuDoubleComplex one = {1.0, 0.0};
 cuDoubleComplex zero = {0.0, 0.0};
 // Load a sparse complex linear system from the binary file filePath.
 //
 // Data consumed from the file, in order: int N, int nz, then nz records of
 // (int row, int col, cuDoubleComplex value), then N cuDoubleComplex values of
 // the right-hand side.
 //
 // Outputs: *pN and *pnz receive the two header integers. *cooVal, *cooRowIdx and
 // *cooColIdx (nz entries each) and *b (N entries) are allocated with new[] and
 // zero-initialized; the caller releases them with delete[].
 //
 // If the file cannot be opened, or the header is unreadable or negative, a
 // message is written to std::cerr, *pN and *pnz are set to 0 and the arrays are
 // zero-length. A file that ends early is reported as well; entries that could
 // not be read stay zero.
 void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
 	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
 {
 	// A failed stream read does not store anything, so give the sizes a defined
 	// value before they are used as array lengths below.
 	*pN = 0;
 	*pnz = 0;
 	std::ifstream in(filePath, std::ios::binary);
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	if (in.fail() || *pN < 0 || *pnz < 0)
 	{
 		std::cerr << "read: cannot read a valid header from " << filePath << std::endl;
 		*pN = 0;
 		*pnz = 0;
 	}
 	*cooVal = new cuDoubleComplex[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new cuDoubleComplex[*pN]{};
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
 	}
 	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
 	// Sizes are only non-zero when the header was valid, so this reports a short payload.
 	if (in.fail() && (*pN > 0 || *pnz > 0))
 	{
 		std::cerr << "read: " << filePath << " ended before all data was read" << std::endl;
 	}
 	return;
 }
 // Load the reference solution from the binary file filePath.
 //
 // Data consumed from the file, in order: int N, then N cuDoubleComplex values.
 //
 // Outputs: *pN receives N when a valid (non-negative) header was read; *x is
 // allocated with new[] (zero-initialized, *pN entries) and must be released by
 // the caller with delete[].
 //
 // If the header cannot be read, a message is written to std::cerr, *pN keeps
 // the value the caller passed in and *x stays all zero. A payload that ends
 // early is reported too.
 void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	// Read into a local first so a failed or partial read cannot clobber *pN.
 	int n = 0;
 	in.read((char*)&n, sizeof(int));
 	const bool headerOk = !in.fail() && n >= 0;
 	if (headerOk)
 	{
 		*pN = n;
 	}
 	else
 	{
 		std::cerr << "readAnswer: cannot read a valid header from " << filePath << std::endl;
 		// Make sure the payload read below is skipped.
 		in.setstate(std::ios::failbit);
 	}
 	*x = new cuDoubleComplex[*pN]{};
 	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
 	if (headerOk && in.fail())
 	{
 		std::cerr << "readAnswer: " << filePath << " ended before all data was read" << std::endl;
 	}
 	return;
 }
 // Error measure between two complex vectors of n entries: the square root of
 // the summed squared moduli of clcg_Zdiff(a[i], b[i]), divided by n.
 // NOTE(review): clcg_Zdiff presumably returns a[i] - b[i] — confirm in the library headers.
 lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
 {
 	lcg_float sq_sum = 0.0;
 	size_t k = 0;
 	while (k < n)
 	{
 		const cuDoubleComplex delta = clcg_Zdiff(a[k], b[k]);
 		sq_sum += (delta.x*delta.x + delta.y*delta.y);
 		k++;
 	}
 	return sqrt(sq_sum)/n;
 }
 class sample13 : public CLCG_CUDA_Solver
 {
 public:
 	sample13(){}
 	virtual ~sample13(){}
 	void solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 	void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		// Calculate the product of A*x
 		cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_tuf);
 		return;
 	}
 	void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, x, dvec_p, 
 			CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L);
 		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p, prod_Ax, 
 			CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT);
 		return;
 	}
 private:
 	int N, nz, lnz;
 	int *rowIdxA, *colIdxA;
 	cuDoubleComplex *A, *b;
 	cuDoubleComplex *ans_x;
 	int *L_row, *L_col;
    cuDoubleComplex *L_val;
 	void *d_tuf, *d_tuf2;
 	cusparseSpMatDescr_t smat_A;
 	cusparseSpMatDescr_t smat_L;
 	cusparseSpSVDescr_t descr_L, descr_LT;
 	int *d_rowIdxA; // COO
 	int *d_rowPtrA; // CSR
 	int *d_colIdxA;
 	cuDoubleComplex *d_A;
 	cuDoubleComplex *d_t;
 	cuDoubleComplex *d_p;
 	cusparseDnVecDescr_t dvec_p;
 	int *d_rowIdxL; // COO
 	int *d_rowPtrL; // CSR
 	int *d_colIdxL;
 	cuDoubleComplex *d_L;
 	cuDoubleComplex *host_m;
 	cusparseDnVecDescr_t dvec_tmp;
 };
 // Load a complex sparse system from inputPath and a reference solution from
 // answerPath, build the factor L with clcg_incomplete_Cholesky_cuda_half(),
 // set up the cuSPARSE triangular solves used by MxProduct(), run
 // MinimizePreconditioned() with CLCG_PCG and print the averaged error against
 // the reference solution to std::clog.
 //
 // cub_handle / cus_handle are caller-created cuBLAS / cuSPARSE handles; they are
 // used here but not destroyed here.
 //
 // Every host array and device resource acquired in this function is released
 // before it returns.
 // NOTE(review): none of the CUDA / cuSPARSE return codes are checked.
 void sample13::solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// Host side: matrix A in COO form (nz entries), right-hand side b and reference answer (N entries).
 	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	readAnswer(answerPath, &N, &ans_x);
    // lnz is the array length used for L below; presumably the number of lower-triangular entries of A — TODO confirm.
    clcg_incomplete_Cholesky_cuda_half_buffsize(rowIdxA, colIdxA, nz, &lnz);
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
    std::clog << "lnz = " << lnz << std::endl;
 	L_row = new int [lnz];
    L_col = new int [lnz];
    L_val = new cuDoubleComplex [lnz];
    clcg_incomplete_Cholesky_cuda_half(rowIdxA, colIdxA, A, N, nz, lnz, L_row, L_col, L_val);
 /*
    for (size_t i = 0; i < lnz; i++)
    {
        std::cout << L_row[i] << " " << L_col[i] << " (" << L_val[i].x << "," << L_val[i].y << ")\n";
    }
 */
    // Allocate GPU memory & copy matrix/vector to device
 	cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex));
 	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
 	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxA, nz * sizeof(int));
 	// d_t backs dvec_tmp (created further down); d_p backs dvec_p, the
 	// intermediate vector between the two triangular solves in MxProduct().
 	cudaMalloc(&d_t, N * sizeof(cuDoubleComplex));
 	cudaMalloc(&d_p, N * sizeof(cuDoubleComplex));
    cusparseCreateDnVec(&dvec_p, N, d_p, CUDA_C_64F);
 	cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
    // Same layout for L, with lnz entries.
    cudaMalloc(&d_L, lnz * sizeof(cuDoubleComplex));
 	cudaMalloc(&d_rowIdxL, lnz * sizeof(int));
 	cudaMalloc(&d_rowPtrL, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxL, lnz * sizeof(int));
    cudaMemcpy(d_L, L_val, lnz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxL, L_row, lnz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxL, L_col, lnz * sizeof(int), cudaMemcpyHostToDevice);
 	// Convert matrix A from COO format to CSR format
 	cusparseXcoo2csr(cus_handle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
 	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
 	// Convert matrix L from COO format to CSR format
    cusparseXcoo2csr(cus_handle, d_rowIdxL, lnz, N, d_rowPtrL, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
    cusparseCreateCsr(&smat_L, N, N, lnz, d_rowPtrL, d_colIdxL, d_L, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
    // Specify Lower fill mode.
    cusparseFillMode_t fillmode = CUSPARSE_FILL_MODE_LOWER;
 	cusparseSpMatSetAttribute(smat_L, CUSPARSE_SPMAT_FILL_MODE, &fillmode, sizeof(fillmode));
 	// Specify Non-Unit diagonal type.
    cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
 	cusparseSpMatSetAttribute(smat_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagtype, sizeof(diagtype));
    // This is just used to get bufferSize;
 	cusparseCreateDnVec(&dvec_tmp, N, d_t, CUDA_C_64F);
 	size_t bufferSize_B;
 	cusparseSpMV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
 		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B);
    // --- Start of the preconditioning part ---
    cusparseSpSV_createDescr(&descr_L);
    cusparseSpSV_createDescr(&descr_LT);
    size_t bufferSize, bufferSize_L, bufferSize_LT;
 	bufferSize = bufferSize_B;
    cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, dvec_p, 
        dvec_tmp, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, &bufferSize_L);
    cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p, 
        dvec_tmp, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, &bufferSize_LT);
    // Both work buffers get the largest of the three requested sizes (SpMV, SpSV, SpSV transposed).
    bufferSize = max(max(bufferSize, bufferSize_L), bufferSize_LT);
 	cudaMalloc(&d_tuf, bufferSize);
 	cudaMalloc(&d_tuf2, bufferSize);
 	// One analysis per solve direction: descr_L uses d_tuf, descr_LT uses d_tuf2.
 	// NOTE(review): d_tuf is also the work buffer AxProduct() passes to cusparseSpMV —
 	// confirm against the cuSPARSE SpSV documentation that sharing it with the analysis is safe.
 	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, dvec_tmp, dvec_p, 
 		CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, d_tuf);
 	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p, dvec_tmp, 
 		CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, d_tuf2);
 	// --- End of the preconditioning part ---
 	// Declare an initial solution
 	// NOTE(review): self_para is filled in but never passed to anything in this function — confirm whether it should be applied to the solver.
    clcg_para self_para = clcg_default_parameters();
 	self_para.epsilon = 1e-6;
 	self_para.abs_diff = 0;
 	// Preconditioning with incomplete-Cholesky factorization
 	// The solution vector comes from the library allocator and every entry is set to zero.
 	host_m = clcg_malloc_cuda(N);
 	clcg_vecset_cuda(host_m, zero, N);
 	MinimizePreconditioned(cub_handle, cus_handle, host_m, b, N, nz, CLCG_PCG);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	// Free Host memory
 	if (rowIdxA != nullptr) delete[] rowIdxA;
 	if (colIdxA != nullptr) delete[] colIdxA;
    if (A != nullptr) delete[] A;
 	if (b != nullptr) delete[] b;
 	if (ans_x != nullptr) delete[] ans_x;
    if (L_row != nullptr) delete[] L_row;
    if (L_col != nullptr) delete[] L_col;
    if (L_val != nullptr) delete[] L_val;
 	clcg_free_cuda(host_m);
 	// Free device memory and cuSPARSE descriptors
 	cusparseDestroyDnVec(dvec_tmp);
    cusparseDestroyDnVec(dvec_p);
 	cudaFree(d_tuf);
 	cudaFree(d_tuf2);
 	cudaFree(d_rowIdxA);
 	cudaFree(d_rowPtrA);
 	cudaFree(d_colIdxA);
    cudaFree(d_A);
 	cudaFree(d_t);
 	cudaFree(d_p);
    cudaFree(d_rowIdxL);
 	cudaFree(d_rowPtrL);
 	cudaFree(d_colIdxL);
    cudaFree(d_L);
 	cusparseDestroySpMat(smat_A);
 	cusparseDestroySpMat(smat_L);
    cusparseSpSV_destroyDescr(descr_L);
    cusparseSpSV_destroyDescr(descr_LT);
 	return;
 }
 int main(int argc, char **argv)
 {
 	std::string inputPath = "data/case_10K_cA";
 	std::string answerPath = "data/case_10K_cB";
 	cublasHandle_t cubHandle;
 	cusparseHandle_t cusHandle;
 	cublasCreate(&cubHandle);
 	cusparseCreate(&cusHandle);
 	sample13 sp;
 	sp.set_report_interval(0);
 	sp.solve(inputPath, answerPath, cubHandle, cusHandle);
 	cublasDestroy(cubHandle);
 	cusparseDestroy(cusHandle);
 	return 0;
 }
--- a/src/sample/sample14.cu
+++ b/src/sample/sample14.cu
@@ -0,0 +1,327 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cmath>
 #include "../lib/solver_cuda.h"
 #include "../lib/preconditioner_cuda.h"
 // Declare as global variables
 // Single-precision complex constants 1+0i and 0+0i. Their addresses are passed
 // as the scalar arguments of the cusparseSpMV / cusparseSpSV calls in this sample.
 cuComplex one = {1.0, 0.0};
 cuComplex zero = {0.0, 0.0};
 // Load a sparse complex linear system from the binary file filePath.
 //
 // Data consumed from the file, in order: int N, int nz, then nz records of
 // (int row, int col, cuDoubleComplex value), then N cuDoubleComplex values of
 // the right-hand side.
 //
 // Outputs: *pN and *pnz receive the two header integers. *cooVal, *cooRowIdx and
 // *cooColIdx (nz entries each) and *b (N entries) are allocated with new[] and
 // zero-initialized; the caller releases them with delete[].
 //
 // If the file cannot be opened, or the header is unreadable or negative, a
 // message is written to std::cerr, *pN and *pnz are set to 0 and the arrays are
 // zero-length. A file that ends early is reported as well; entries that could
 // not be read stay zero.
 void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
 	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
 {
 	// A failed stream read does not store anything, so give the sizes a defined
 	// value before they are used as array lengths below.
 	*pN = 0;
 	*pnz = 0;
 	std::ifstream in(filePath, std::ios::binary);
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	if (in.fail() || *pN < 0 || *pnz < 0)
 	{
 		std::cerr << "read: cannot read a valid header from " << filePath << std::endl;
 		*pN = 0;
 		*pnz = 0;
 	}
 	*cooVal = new cuDoubleComplex[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new cuDoubleComplex[*pN]{};
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
 	}
 	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
 	// Sizes are only non-zero when the header was valid, so this reports a short payload.
 	if (in.fail() && (*pN > 0 || *pnz > 0))
 	{
 		std::cerr << "read: " << filePath << " ended before all data was read" << std::endl;
 	}
 	return;
 }
 // Load the reference solution from the binary file filePath.
 //
 // Data consumed from the file, in order: int N, then N cuDoubleComplex values.
 //
 // Outputs: *pN receives N when a valid (non-negative) header was read; *x is
 // allocated with new[] (zero-initialized, *pN entries) and must be released by
 // the caller with delete[].
 //
 // If the header cannot be read, a message is written to std::cerr, *pN keeps
 // the value the caller passed in and *x stays all zero. A payload that ends
 // early is reported too.
 void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	// Read into a local first so a failed or partial read cannot clobber *pN.
 	int n = 0;
 	in.read((char*)&n, sizeof(int));
 	const bool headerOk = !in.fail() && n >= 0;
 	if (headerOk)
 	{
 		*pN = n;
 	}
 	else
 	{
 		std::cerr << "readAnswer: cannot read a valid header from " << filePath << std::endl;
 		// Make sure the payload read below is skipped.
 		in.setstate(std::ios::failbit);
 	}
 	*x = new cuDoubleComplex[*pN]{};
 	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
 	if (headerOk && in.fail())
 	{
 		std::cerr << "readAnswer: " << filePath << " ended before all data was read" << std::endl;
 	}
 	return;
 }
 // Error measure between two single-precision complex vectors of n entries: the
 // square root of the summed squared moduli of clcg_Cdiff(a[i], b[i]), divided by n.
 // NOTE(review): clcg_Cdiff presumably returns a[i] - b[i] — confirm in the library headers.
 float avg_error(cuComplex *a, cuComplex *b, int n)
 {
 	float sq_sum = 0.0;
 	size_t k = 0;
 	while (k < n)
 	{
 		const cuComplex delta = clcg_Cdiff(a[k], b[k]);
 		sq_sum += (delta.x*delta.x + delta.y*delta.y);
 		k++;
 	}
 	return sqrt(sq_sum)/n;
 }
 class sample14 : public CLCG_CUDAF_Solver
 {
 public:
 	sample14(){}
 	virtual ~sample14(){}
 	void solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
 	void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		// Calculate the product of A*x
 		cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax, CUDA_C_32F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
 		return;
 	}
 	void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
 		const int n_size, const int nz_size, cusparseOperation_t oper_t)
 	{
 		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, x, dvec_p, 
 			CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L);
 		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p, prod_Ax, 
 			CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT);
 		return;
 	}
 private:
 	int N, nz, lnz;
 	int *rowIdxA, *colIdxA;
 	cuDoubleComplex *A, *b;
 	cuDoubleComplex *ans_x;
    cuComplex *Af, *bf;
 	cuComplex *ans_xf;
 	int *L_row, *L_col;
    cuComplex *L_val;
 	void *d_buf, *d_buf2;
 	cusparseSpMatDescr_t smat_A;
 	cusparseSpMatDescr_t smat_L;
 	cusparseSpSVDescr_t descr_L, descr_LT;
 	int *d_rowIdxA; // COO
 	int *d_rowPtrA; // CSR
 	int *d_colIdxA;
 	cuComplex *d_A;
 	cuComplex *d_t;
 	cuComplex *d_p;
 	cusparseDnVecDescr_t dvec_p;
 	int *d_rowIdxL; // COO
 	int *d_rowPtrL; // CSR
 	int *d_colIdxL;
 	cuComplex *d_L;
 	cuComplex *host_m;
 	cusparseDnVecDescr_t dvec_tmp;
 };
 // Load a complex sparse system from inputPath and a reference solution from
 // answerPath, convert them to single precision, build the factor L with
 // clcg_incomplete_Cholesky_cuda_half(), set up the cuSPARSE triangular solves
 // used by MxProduct(), run MinimizePreconditioned() with CLCG_PCG and print the
 // averaged error against the reference solution to std::clog.
 //
 // cub_handle / cus_handle are caller-created cuBLAS / cuSPARSE handles; they are
 // used here but not destroyed here.
 //
 // Every host array and device resource acquired in this function is released
 // before it returns.
 // NOTE(review): none of the CUDA / cuSPARSE return codes are checked.
 void sample14::solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
 {
 	// Host side: matrix A in COO form (nz entries), right-hand side b and reference answer (N entries).
 	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	readAnswer(answerPath, &N, &ans_x);
 	// lnz is the array length used for L below; presumably the number of lower-triangular entries of A — TODO confirm.
 	clcg_incomplete_Cholesky_cuda_half_buffsize(rowIdxA, colIdxA, nz, &lnz);
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	std::clog << "lnz = " << lnz << std::endl;
 	// Single-precision copies. A holds nz entries, but b and ans_x hold only N
 	// entries (see read() / readAnswer()), so the vectors are converted in their
 	// own loop. Converting all three over nz entries reads past the end of b and ans_x.
 	Af = new cuComplex [nz];
 	bf = new cuComplex [N];
 	ans_xf = new cuComplex [N];
 	for (int i = 0; i < nz; i++)
 	{
 		Af[i].x = A[i].x; Af[i].y = A[i].y;
 	}
 	for (int i = 0; i < N; i++)
 	{
 		bf[i].x = b[i].x; bf[i].y = b[i].y;
 		ans_xf[i].x = ans_x[i].x; ans_xf[i].y = ans_x[i].y;
 	}
 	L_row = new int [lnz];
 	L_col = new int [lnz];
 	L_val = new cuComplex [lnz];
 	clcg_incomplete_Cholesky_cuda_half(rowIdxA, colIdxA, Af, N, nz, lnz, L_row, L_col, L_val);
 /*
    for (size_t i = 0; i < lnz; i++)
    {
        std::cout << L_row[i] << " " << L_col[i] << " (" << L_val[i].x << "," << L_val[i].y << ")\n";
    }
 */
 	// Allocate GPU memory & copy matrix/vector to device
 	cudaMalloc(&d_A, nz * sizeof(cuComplex));
 	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
 	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxA, nz * sizeof(int));
 	// d_t backs dvec_tmp (created further down); d_p backs dvec_p, the
 	// intermediate vector between the two triangular solves in MxProduct().
 	cudaMalloc(&d_t, N * sizeof(cuComplex));
 	cudaMalloc(&d_p, N * sizeof(cuComplex));
 	cusparseCreateDnVec(&dvec_p, N, d_p, CUDA_C_32F);
 	cudaMemcpy(d_A, Af, nz * sizeof(cuComplex), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_t, bf, N * sizeof(cuComplex), cudaMemcpyHostToDevice);
 	// Same layout for L, with lnz entries.
 	cudaMalloc(&d_L, lnz * sizeof(cuComplex));
 	cudaMalloc(&d_rowIdxL, lnz * sizeof(int));
 	cudaMalloc(&d_rowPtrL, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxL, lnz * sizeof(int));
 	cudaMemcpy(d_L, L_val, lnz * sizeof(cuComplex), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxL, L_row, lnz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxL, L_col, lnz * sizeof(int), cudaMemcpyHostToDevice);
 	// Convert matrix A from COO format to CSR format
 	cusparseXcoo2csr(cus_handle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
 	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_32F);
 	// Convert matrix L from COO format to CSR format
 	cusparseXcoo2csr(cus_handle, d_rowIdxL, lnz, N, d_rowPtrL, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
 	cusparseCreateCsr(&smat_L, N, N, lnz, d_rowPtrL, d_colIdxL, d_L, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_32F);
 	// Specify Lower fill mode.
 	cusparseFillMode_t fillmode = CUSPARSE_FILL_MODE_LOWER;
 	cusparseSpMatSetAttribute(smat_L, CUSPARSE_SPMAT_FILL_MODE, &fillmode, sizeof(fillmode));
 	// Specify Non-Unit diagonal type.
 	cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
 	cusparseSpMatSetAttribute(smat_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagtype, sizeof(diagtype));
 	// This is just used to get bufferSize;
 	cusparseCreateDnVec(&dvec_tmp, N, d_t, CUDA_C_32F);
 	size_t bufferSize_B;
 	cusparseSpMV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
 		dvec_tmp, &zero, dvec_tmp, CUDA_C_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B);
 	// --- Start of the preconditioning part ---
 	cusparseSpSV_createDescr(&descr_L);
 	cusparseSpSV_createDescr(&descr_LT);
 	size_t bufferSize, bufferSize_L, bufferSize_LT;
 	bufferSize = bufferSize_B;
 	cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, dvec_p,
 		dvec_tmp, CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, &bufferSize_L);
 	cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p,
 		dvec_tmp, CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, &bufferSize_LT);
 	// Both work buffers get the largest of the three requested sizes (SpMV, SpSV, SpSV transposed).
 	bufferSize = max(max(bufferSize, bufferSize_L), bufferSize_LT);
 	cudaMalloc(&d_buf, bufferSize);
 	cudaMalloc(&d_buf2, bufferSize);
 	// One analysis per solve direction: descr_L uses d_buf, descr_LT uses d_buf2.
 	// NOTE(review): d_buf is also the work buffer AxProduct() passes to cusparseSpMV —
 	// confirm against the cuSPARSE SpSV documentation that sharing it with the analysis is safe.
 	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, dvec_tmp, dvec_p,
 		CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, d_buf);
 	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p, dvec_tmp,
 		CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, d_buf2);
 	// --- End of the preconditioning part ---
 	// Declare an initial solution
 	// NOTE(review): self_para is filled in but never passed to anything in this function — confirm whether it should be applied to the solver.
 	clcg_para self_para = clcg_default_parameters();
 	self_para.epsilon = 1e-6;
 	self_para.abs_diff = 0;
 	host_m = new cuComplex[N];
 	// Preconditioning with incomplete-Cholesky factorization
 	// The initial guess is the zero vector.
 	for (int i = 0; i < N; i++)
 	{
 		host_m[i].x = 0.0; host_m[i].y = 0.0;
 	}
 	MinimizePreconditioned(cub_handle, cus_handle, host_m, bf, N, nz, CLCG_PCG);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_xf, N) << std::endl;
 	// Free Host memory
 	if (rowIdxA != nullptr) delete[] rowIdxA;
 	if (colIdxA != nullptr) delete[] colIdxA;
 	if (A != nullptr) delete[] A;
 	if (b != nullptr) delete[] b;
 	if (ans_x != nullptr) delete[] ans_x;
 	if (Af != nullptr) delete[] Af;
 	if (bf != nullptr) delete[] bf;
 	if (ans_xf != nullptr) delete[] ans_xf;
 	if (L_row != nullptr) delete[] L_row;
 	if (L_col != nullptr) delete[] L_col;
 	if (L_val != nullptr) delete[] L_val;
 	if (host_m != nullptr) delete[] host_m;
 	// Free device memory and cuSPARSE descriptors
 	cusparseDestroyDnVec(dvec_tmp);
 	cusparseDestroyDnVec(dvec_p);
 	cudaFree(d_buf);
 	cudaFree(d_buf2);
 	cudaFree(d_rowIdxA);
 	cudaFree(d_rowPtrA);
 	cudaFree(d_colIdxA);
 	cudaFree(d_A);
 	cudaFree(d_t);
 	cudaFree(d_p);
 	cudaFree(d_rowIdxL);
 	cudaFree(d_rowPtrL);
 	cudaFree(d_colIdxL);
 	cudaFree(d_L);
 	cusparseDestroySpMat(smat_A);
 	cusparseDestroySpMat(smat_L);
 	cusparseSpSV_destroyDescr(descr_L);
 	cusparseSpSV_destroyDescr(descr_LT);
 	return;
 }
 int main(int argc, char **argv)
 {
 	std::string inputPath = "data/case_1K_cA";
 	std::string answerPath = "data/case_1K_cB";
 	cublasHandle_t cubHandle;
 	cusparseHandle_t cusHandle;
 	cublasCreate(&cubHandle);
 	cusparseCreate(&cusHandle);
 	sample14 sp;
 	sp.set_report_interval(100);
 	sp.solve(inputPath, answerPath, cubHandle, cusHandle);
 	cublasDestroy(cubHandle);
 	cusparseDestroy(cusHandle);
 	return 0;
 }
--- a/src/sample/sample15.cu
+++ b/src/sample/sample15.cu
@@ -0,0 +1,223 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cmath>
 #include "../lib/lcg_cuda.h"
 // Load a sparse real linear system from the binary file filePath.
 //
 // Data consumed from the file, in order: int N, int nz, then nz records of
 // (int row, int col, double value), then N doubles of the right-hand side.
 //
 // Outputs: *pN and *pnz receive the two header integers. *cooVal, *cooRowIdx and
 // *cooColIdx (nz entries each) and *b (N entries) are allocated with new[] and
 // zero-initialized; the caller releases them with delete[].
 //
 // If the file cannot be opened, or the header is unreadable or negative, a
 // message is written to std::cerr, *pN and *pnz are set to 0 and the arrays are
 // zero-length. A file that ends early is reported as well; entries that could
 // not be read stay zero.
 void read(std::string filePath, int *pN, int *pnz, double **cooVal,
 	int **cooRowIdx, int **cooColIdx, double **b)
 {
 	// A failed stream read does not store anything, so give the sizes a defined
 	// value before they are used as array lengths below.
 	*pN = 0;
 	*pnz = 0;
 	std::ifstream in(filePath, std::ios::binary);
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	if (in.fail() || *pN < 0 || *pnz < 0)
 	{
 		std::cerr << "read: cannot read a valid header from " << filePath << std::endl;
 		*pN = 0;
 		*pnz = 0;
 	}
 	*cooVal = new double[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new double[*pN]{};
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&(*cooVal)[i], sizeof(double));
 	}
 	in.read((char*)(*b), sizeof(double)*(*pN));
 	// Sizes are only non-zero when the header was valid, so this reports a short payload.
 	if (in.fail() && (*pN > 0 || *pnz > 0))
 	{
 		std::cerr << "read: " << filePath << " ended before all data was read" << std::endl;
 	}
 	return;
 }
 // Load the reference solution from the binary file filePath.
 //
 // Data consumed from the file, in order: int N, then N doubles.
 //
 // Outputs: *pN receives N when a valid (non-negative) header was read; *x is
 // allocated with new[] (zero-initialized, *pN entries) and must be released by
 // the caller with delete[].
 //
 // If the header cannot be read, a message is written to std::cerr, *pN keeps
 // the value the caller passed in and *x stays all zero. A payload that ends
 // early is reported too.
 void readAnswer(std::string filePath, int *pN, double **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	// Read into a local first so a failed or partial read cannot clobber *pN.
 	int n = 0;
 	in.read((char*)&n, sizeof(int));
 	const bool headerOk = !in.fail() && n >= 0;
 	if (headerOk)
 	{
 		*pN = n;
 	}
 	else
 	{
 		std::cerr << "readAnswer: cannot read a valid header from " << filePath << std::endl;
 		// Make sure the payload read below is skipped.
 		in.setstate(std::ios::failbit);
 	}
 	*x = new double[*pN]{};
 	in.read((char*)(*x), sizeof(double)*(*pN));
 	if (headerOk && in.fail())
 	{
 		std::cerr << "readAnswer: " << filePath << " ended before all data was read" << std::endl;
 	}
 	return;
 }
 // Error measure between two real vectors of n entries: the square root of the
 // summed squared differences a[i] - b[i], divided by n.
 lcg_float avg_error(lcg_float *a, lcg_float *b, int n)
 {
 	lcg_float sq_sum = 0.0;
 	size_t k = 0;
 	while (k < n)
 	{
 		sq_sum += (a[k] - b[k])*(a[k] - b[k]);
 		k++;
 	}
 	return sqrt(sq_sum)/n;
 }
 // Declare as global variables
 // They are global because cudaAx() below reads them and main() sets them up.
 // Scalars whose addresses are passed to cusparseSpMV in cudaAx().
 lcg_float one = 1.0;
 lcg_float zero = 0.0;
 // Device work buffer passed to cusparseSpMV; allocated in main().
 void *d_buf;
 // cuSPARSE descriptor of matrix A (CSR), created in main().
 cusparseSpMatDescr_t smat_A;
 // Device storage of matrix A.
 int *d_rowIdxA; // COO
 int *d_rowPtrA; // CSR
 int *d_colIdxA;
 double *d_A;
 // Matrix descriptor and csric02 info object; main() uses them only for the
 // cusparseDcsric02_bufferSize query.
 cusparseMatDescr_t descr_A = 0;
 csric02Info_t icinfo_A = 0;
 void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
 {
 	// Calculate the product of A*x
 	cusparseSpMV(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
 		x, &zero, prod_Ax, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
    return;
 }
 // Progress callback handed to lcg_solver_cuda(): once converge has dropped to
 // param->epsilon or below, prints the iteration count k and the convergence
 // value to std::clog. Always returns 0.
 // instance, m, n_size and nz_size are not used.
 int cudaProgress(void* instance, const lcg_float* m, const lcg_float converge,
 	const lcg_para* param, const int n_size, const int nz_size, const int k)
 {
 	// Written as a negated <= so a NaN convergence value is treated exactly as before.
 	const bool reached = (converge <= param->epsilon);
 	if (!reached)
 	{
 		return 0;
 	}
 	std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
 	return 0;
 }
 int main(int argc, char **argv)
 {
 	std::string inputPath = "data/case_1M_A";
 	std::string answerPath = "data/case_1M_B";
 	int N;
 	int nz;
 	double *A;
 	int *rowIdxA;
 	int *colIdxA;
 	double *b;
 	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	double *ans_x;
 	readAnswer(answerPath, &N, &ans_x);
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	// Create handles
 	cublasHandle_t cubHandle;
 	cusparseHandle_t cusHandle;
 	cublasCreate(&cubHandle);
 	cusparseCreate(&cusHandle);
 	// Allocate GPU memory & copy matrix/vector to device
 	cudaMalloc(&d_A, nz * sizeof(double));
 	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
 	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxA, nz * sizeof(int));
 	cudaMemcpy(d_A, A, nz * sizeof(double), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	// Convert matrix A from COO format to CSR format
 	cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
 	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
 	// create descriptor for matrix A
 	cusparseCreateMatDescr(&descr_A);
 	// initialize properties of matrix A
 	cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_GENERAL);
 	cusparseSetMatFillMode(descr_A, CUSPARSE_FILL_MODE_LOWER);
 	cusparseSetMatDiagType(descr_A, CUSPARSE_DIAG_TYPE_NON_UNIT);
 	cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO);
 	int bufferSize;
 	cusparseCreateCsric02Info(&icinfo_A);
 	cusparseDcsric02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA, 
 		d_colIdxA, icinfo_A, &bufferSize);
 	cudaMalloc(&d_buf, bufferSize);
 	// Declare an initial solution
    lcg_para self_para = lcg_default_parameters();
 	self_para.epsilon = 1e-6;
 	self_para.abs_diff = 0;
 	int ret;
 	double *host_m = new double[N];
 	// Solve with CG
 	for (size_t i = 0; i < N; i++)
 	{
 		host_m[i] = 0.0;
 	}
    ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CG);
    lcg_error_str(ret);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	// Solve with CGS
 	for (size_t i = 0; i < N; i++)
 	{
 		host_m[i] = 0.0;
 	}
 	ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CGS);
    lcg_error_str(ret);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	// Free Host memory
 	delete[] A;
 	delete[] rowIdxA;
 	delete[] colIdxA;
 	delete[] b;
 	delete[] ans_x;
 	delete[] host_m;
 	// Free Device memory
 	cudaFree(d_A);
 	cudaFree(d_rowIdxA);
 	cudaFree(d_rowPtrA);
 	cudaFree(d_colIdxA);
 	cusparseDestroySpMat(smat_A);
 	cudaFree(d_buf);
 	cusparseDestroyMatDescr(descr_A);
 	cusparseDestroyCsric02Info(icinfo_A);
 	// Free handles
 	cublasDestroy(cubHandle);
 	cusparseDestroy(cusHandle);
 	return 0;
 }
--- a/src/sample/sample2.cpp
+++ b/src/sample/sample2.cpp
@@ -0,0 +1,170 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "iostream"
 #include "random"
 #include "../lib/solver.h"
 #define M 1000
 #define N 800
 // Largest absolute element-wise deviation between a and b over the first
 // `size` entries. Returns -1 when the loop never runs (size <= 0).
 lcg_float max_diff(const lcg_float *a, const lcg_float *b, int size)
 {
 	lcg_float worst = -1;
 	for (int k = 0; k < size; ++k)
 	{
 		const lcg_float delta = a[k] - b[k];
 		worst = lcg_max(sqrt(delta*delta), worst);
 	}
 	return worst;
 }
 // Sample solver object: owns an M-by-N kernel, a length-M scratch array and a
 // length-N array p used for element-wise scaling in MxProduct.
 class TESTFUNC : public LCG_Solver
 {
 public:
 	TESTFUNC();
 	~TESTFUNC();
 	// Compute the right-hand side B of the test system from a vector x
 	void cal_partb(lcg_float *B, const lcg_float *x);
 	// Ax product used by the solver: lcg_matvec with MatNormal writes into
 	// tmp_arr, then lcg_matvec with MatTranspose writes the result into b.
 	// NOTE(review): presumably b = K^T * (K * a) — confirm against lcg_matvec.
 	void AxProduct(const lcg_float* a, lcg_float* b, const int num)
 	{
 		lcg_matvec(kernel, a, tmp_arr, M, num, MatNormal);
 		lcg_matvec(kernel, tmp_arr, b, M, num, MatTranspose);
 	}
 	// Mx product: element-wise scaling b[k] = p[k]*a[k] for k in [0, num)
 	void MxProduct(const lcg_float* a, lcg_float* b, const int num)
 	{
 		for (size_t k = 0; k < num; k++)
 		{
 			b[k] = p[k]*a[k];
 		}
 	}
 private:
 	// Kernel matrix stored as a plain 2D array
 	lcg_float **kernel;
 	// Scratch array for intermediate results
 	lcg_float *tmp_arr;
 	// Preconditioner entries (one per unknown)
 	lcg_float *p;
 };
 // Allocate the kernel, scratch and preconditioner arrays, fill the kernel via
 // lcg_vecrnd(-1.0, 1.0), and set p[col] to the reciprocal of the sum of
 // squares of kernel column `col`.
 TESTFUNC::TESTFUNC()
 {
 	kernel = lcg_malloc(M, N);
 	tmp_arr = lcg_malloc(M);
 	p = lcg_malloc(N);
 	lcg_vecrnd(kernel, -1.0, 1.0, M, N);
 	lcg_vecset(p, 1.0, N);
 	for (size_t col = 0; col < N; col++)
 	{
 		// Accumulate the squared entries of this column over all M rows
 		lcg_float sq_sum = 0.0;
 		for (size_t row = 0; row < M; row++)
 		{
 			sq_sum += kernel[row][col]*kernel[row][col];
 		}
 		p[col] = 1.0/sq_sum;
 	}
 }
 // Release the three arrays allocated in the constructor (reverse order of allocation).
 TESTFUNC::~TESTFUNC()
 {
 	lcg_free(p);
 	lcg_free(tmp_arr);
 	lcg_free(kernel, M);
 }
 // Build the right-hand side for the test problem: lcg_matvec is applied to x
 // with MatNormal (result stored in tmp_arr), then to tmp_arr with MatTranspose
 // (result stored in B).
 // NOTE(review): presumably B = K^T * (K * x) for the M-by-N kernel K — confirm
 // against lcg_matvec's documentation.
 void TESTFUNC::cal_partb(lcg_float *B, const lcg_float *x)
 {
 	lcg_matvec(kernel, x, tmp_arr, M, N, MatNormal);
 	lcg_matvec(kernel, tmp_arr, B, M, N, MatTranspose);
 }
 // Sample driver: builds a synthetic solution fm, forms B from it through
 // TESTFUNC::cal_partb, then runs each solver variant from a zero start and
 // prints the largest deviation of the result from fm after every run.
 int main(int argc, char const *argv[])
 {
 	// Generate a synthetic reference solution (lcg_vecrnd called with bounds 1.0 and 2.0)
 	double *fm = lcg_malloc(N);
 	lcg_vecrnd(fm, 1.0, 2.0, N);
 	TESTFUNC test;
 	// Compute the right-hand side B of the system from fm
 	double *B = lcg_malloc(N);
 	test.cal_partb(B, fm);
 	/******************** preparation finished ************************/
 	lcg_para self_para = lcg_default_parameters();
 	self_para.epsilon = 1e-6;
 	self_para.abs_diff = 0;
 	test.set_lcg_parameter(self_para);
 	// Declare the solution vector and start from zero
 	lcg_float *m = lcg_malloc(N);
 	lcg_vecset(m, 0.0, N);
 	// Lower/upper bounds for the constrained solvers (all 1.0 and all 2.0)
 	lcg_float *low = lcg_malloc(N);
 	lcg_float *hig = lcg_malloc(N);
 	lcg_vecset(low, 1.0, N);
 	lcg_vecset(hig, 2.0, N);
 	// Each run below resets m to zero first so the solvers do not warm-start from
 	// the previous result
 	test.Minimize(m, B, N, LCG_CG);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	test.MinimizePreconditioned(m, B, N);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	test.Minimize(m, B, N, LCG_CGS);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	test.Minimize(m, B, N, LCG_BICGSTAB);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	test.Minimize(m, B, N, LCG_BICGSTAB2);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	test.MinimizeConstrained(m, B, low, hig, N, LCG_PG);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	lcg_vecset(m, 0.0, N);
 	test.MinimizeConstrained(m, B, low, hig, N, LCG_SPG);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	// Release every array allocated above
 	lcg_free(fm);
 	lcg_free(B);
 	lcg_free(m);
 	lcg_free(low);
 	lcg_free(hig);
 	return 0;
 }
--- a/src/sample/sample3.cpp
+++ b/src/sample/sample3.cpp
@@ -0,0 +1,129 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "iostream"
 #include "../lib/clcg.h"
 #define N 100
 // Largest value of clcg_module(a[k] - b[k]) over the first `size` entries.
 // Returns -1 when the loop never runs (size <= 0).
 lcg_float max_diff(const lcg_complex *a, const lcg_complex *b, int size)
 {
 	lcg_float worst = -1;
 	for (int k = 0; k < size; ++k)
 	{
 		lcg_complex delta = a[k] - b[k];
 		worst = lcg_max(clcg_module(&delta), worst);
 	}
 	return worst;
 }
 // Kernel matrix stored as a plain 2D array; allocated and freed in main()
 lcg_complex **kernel;
 // Matrix-vector product callback passed to clcg_solver: forwards x to
 // clcg_matvec on the global kernel (N rows, x_size columns) together with the
 // requested layout and conjugate flags, writing the result to prod_Ax.
 // `instance` is unused.
 void CalAx(void *instance, const lcg_complex *x, lcg_complex *prod_Ax, 
 	const int x_size, lcg_matrix_e layout, clcg_complex_e conjugate)
 {
 	clcg_matvec(kernel, x, prod_Ax, N, x_size, layout, conjugate);
 	return;
 }
 // Progress-monitoring callback for the solver: prints the iteration count k and
 // the current convergence value on one line (the leading '\r' moves the cursor
 // back to the start of the line). Always returns 0.
 // NOTE(review): the meaning of the return value is defined by the library — confirm.
 int Prog(void* instance, const lcg_complex* m, const lcg_float converge, 
 	const clcg_para* param, const int n_size, const int k)
 {
 	std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	return 0;
 }
 // Sample driver: builds a random symmetric complex kernel, forms B from a
 // synthetic solution fm, then runs five complex solvers from a zero start and
 // prints the largest deviation of each result from fm.
 int main(int argc, char const *argv[])
 {
 	// Seed the C random number generator from the current time
 	srand(time(0));
 	kernel = clcg_malloc(N, N);
 	clcg_vecrnd(kernel, lcg_complex(-1.0, -1.0), lcg_complex(1.0, 1.0), N, N);
 	// Make the kernel symmetric by mirroring the upper triangle into the lower one
 	for (int i = 0; i < N; i++)
 	{
 		for (int j = i; j < N; j++)
 		{
 			kernel[j][i] = kernel[i][j];
 		}
 	}
 	// Generate a synthetic reference solution
 	lcg_complex *fm = clcg_malloc(N);
 	clcg_vecrnd(fm, lcg_complex(1.0, 1.0), lcg_complex(2.0, 2.0), N);
 	// Compute the right-hand side B from fm
 	lcg_complex *B = clcg_malloc(N);
 	clcg_matvec(kernel, fm, B, N, N, MatNormal, NonConjugate);
 	/******************** preparation finished ************************/
 	clcg_para self_para = clcg_default_parameters();
 	self_para.abs_diff = 0;
 	self_para.epsilon = 1e-8;
 	// Declare the solution vector and start from zero
 	lcg_complex *m = clcg_malloc(N);
 	clcg_vecset(m, lcg_complex(0.0, 0.0), N);
 	int ret;
 	// Each stanza: run one solver, print its status string, report the error,
 	// then reset m to zero for the next solver
 	std::clog << "solver: bicg" << std::endl;
 	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_BICG);
 	std::clog << std::endl; clcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	clcg_vecset(m, lcg_complex(0.0, 0.0), N);
 	std::clog << "solver: bicg-symmetric" << std::endl;
 	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_BICG_SYM);
 	std::clog << std::endl; clcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	clcg_vecset(m, lcg_complex(0.0, 0.0), N);
 	std::clog << "solver: cgs" << std::endl;
 	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_CGS);
 	std::clog << std::endl; clcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	clcg_vecset(m, lcg_complex(0.0, 0.0), N);
 	std::clog << "solver: bicgstab" << std::endl;
 	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_BICGSTAB);
 	std::clog << std::endl; clcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	clcg_vecset(m, lcg_complex(0.0, 0.0), N);
 	std::clog << "solver: tfqmr" << std::endl;
 	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_TFQMR);
 	std::clog << std::endl; clcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
 	// Release the kernel and the work vectors
 	clcg_free(kernel, N);
 	clcg_free(fm);
 	clcg_free(B);
 	clcg_free(m);
 	return 0;
 }
--- a/src/sample/sample4.cpp
+++ b/src/sample/sample4.cpp
@@ -0,0 +1,199 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "../lib/solver.h"
 #include "ctime"
 #include "random"
 #include "iostream"
 #include "fstream"
 #include "iomanip"
 #include "complex"
 // Load a sparse complex system from a binary file.
 // Layout read: int N, int nz, then nz records of (int row, int col,
 // std::complex<double> value), then N std::complex<double> entries of b.
 // The four output arrays are allocated with new[] (zero-initialized); the
 // caller owns them and must delete[] them.
 // Throws std::ios_base::failure when the file cannot be opened or when the
 // header (N, nz) cannot be read or is negative. Previously such a file left
 // *pN / *pnz uninitialized and they were used as allocation sizes.
 void read(std::string filePath, int *pN, int *pnz, lcg_complex **cooVal,
 	int **cooRowIdx, int **cooColIdx, lcg_complex **b)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("read: cannot open file " + filePath);
 	}
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	if (!in || *pN < 0 || *pnz < 0)
 	{
 		throw std::ios_base::failure("read: invalid header in file " + filePath);
 	}
 	*cooVal = new lcg_complex[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new lcg_complex[*pN]{};
 	// Values are stored on disk as std::complex<double>; copy them component-wise
 	// into lcg_complex.
 	std::complex<double> std_c;
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&std_c, sizeof(std_c));
 		(*cooVal)[i].real(std_c.real());
 		(*cooVal)[i].imag(std_c.imag());
 	}
 	for (int i = 0; i < *pN; i++)
 	{
 		in.read((char*)&std_c, sizeof(std_c));
 		(*b)[i].real(std_c.real());
 		(*b)[i].imag(std_c.imag());
 	}
 }
 void readAnswer(std::string filePath, int *pN, lcg_complex **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	in.read((char*)pN, sizeof(int));
 	*x = new lcg_complex[*pN]{};
 	std::complex<double> std_c;
 	for (size_t i = 0; i < *pN; i++)
 	{
 		in.read((char*)&std_c, sizeof(std_c));
 		(*x)[i].real(std_c.real());
 		(*x)[i].imag(std_c.imag());
 	}
    return;
 }
 // Largest value of clcg_module(a[k] - b[k]) over the first `size` entries.
 // Returns -1 when the loop never runs (size <= 0).
 lcg_float max_diff(const lcg_complex *a, const lcg_complex *b, int size)
 {
 	lcg_float worst = -1;
 	for (int k = 0; k < size; ++k)
 	{
 		lcg_complex delta = a[k] - b[k];
 		worst = lcg_max(clcg_module(&delta), worst);
 	}
 	return worst;
 }
 // Sample complex solver object backed by a dense n_size-by-n_size kernel.
 class TESTFUNC : public CLCG_Solver
 {
 public:
 	TESTFUNC(int n);
 	~TESTFUNC();
 	void set_kernel(int *row_id, int *col_id, lcg_complex *val, int nz_size);
 	// Ax product used by the solver: forwards to clcg_matvec, passing x_size
 	// for both matrix dimensions together with the layout/conjugate flags.
 	void AxProduct(const lcg_complex *x, lcg_complex *prod_Ax, const int x_size, 
 		lcg_matrix_e layout, clcg_complex_e conjugate)
 	{
 		clcg_matvec(kernel, x, prod_Ax, x_size, x_size, layout, conjugate);
 	}
 private:
 	// Kernel matrix stored as a plain 2D array
 	lcg_complex **kernel;
 	// Number of rows (and columns) of the kernel
 	int n_size;
 };
 // Allocate an n-by-n kernel via clcg_malloc and remember n.
 TESTFUNC::TESTFUNC(int n)
 	: kernel(clcg_malloc(n, n)), n_size(n)
 {
 }
 // Release the kernel allocated in the constructor.
 TESTFUNC::~TESTFUNC()
 {
 	clcg_free(kernel, n_size);
 }
 void TESTFUNC::set_kernel(int *row_id, int *col_id, lcg_complex *val, int nz_size)
 {
 	for (size_t i = 0; i < n_size; i++)
 	{
 		for (size_t j = 0; j < n_size; j++)
 		{
 			kernel[i][j] = lcg_complex(0.0, 0.0);
 		}
 	}
 	for (size_t i = 0; i < nz_size; i++)
 	{
 		kernel[row_id[i]][col_id[i]] = val[i];
 	}
 	return;
 }
 // Sample driver: loads a complex system and its reference solution from
 // data/case_1K_cA and data/case_1K_cB, then runs four complex solvers from a
 // zero start and prints the largest deviation from the reference each time.
 int main(int argc, char const *argv[])
 {
 	int N, nz;
 	int *rowIdxA, *colIdxA;
 	lcg_complex *A, *b, *ans_x;
 	read("data/case_1K_cA", &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	// readAnswer stores the size found in the answer file into N as well
 	readAnswer("data/case_1K_cB", &N, &ans_x);
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	TESTFUNC test(N);
 	test.set_kernel(rowIdxA, colIdxA, A, nz);
 	/******************** preparation finished ************************/
 	clcg_para self_para = clcg_default_parameters();
 	self_para.epsilon = 1e-8;
 	self_para.abs_diff = 0;
 	test.set_clcg_parameter(self_para);
 	// Solution vector, reset to zero before every solver run
 	lcg_complex *m = clcg_malloc(N);
 	auto reset_solution = [&]() { clcg_vecset(m, lcg_complex(0.0, 0.0), N); };
 	auto report = [&]() {
 		std::clog << "maximal difference: " << max_diff(ans_x, m, N) << std::endl << std::endl;
 	};
 	reset_solution();
 	test.Minimize(m, b, N, CLCG_BICG);
 	report();
 	reset_solution();
 	test.Minimize(m, b, N, CLCG_BICG_SYM);
 	report();
 	reset_solution();
 	test.Minimize(m, b, N, CLCG_CGS);
 	report();
 	reset_solution();
 	test.Minimize(m, b, N, CLCG_TFQMR);
 	report();
 	// Release the solution vector and the arrays allocated by read/readAnswer
 	clcg_free(m);
 	delete[] A;
 	delete[] rowIdxA;
 	delete[] colIdxA;
 	delete[] b;
 	delete[] ans_x;
 	return 0;
 }
--- a/src/sample/sample5.cpp
+++ b/src/sample/sample5.cpp
@@ -0,0 +1,155 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "../lib/lcg_eigen.h"
 #include "iostream"
 #include "Eigen/Dense"
 #define M 1000
 #define N 800
 // Largest absolute element-wise deviation between a and b, iterating over
 // a.size() entries. Returns -1 when a is empty.
 lcg_float max_diff(const Eigen::VectorXd &a, const Eigen::VectorXd &b)
 {
 	lcg_float worst = -1;
 	for (int k = 0; k < a.size(); ++k)
 	{
 		worst = lcg_max(sqrt((a[k] - b[k])*(a[k] - b[k])), worst);
 	}
 	return worst;
 }
 // Dense M-by-N kernel matrix initialized with Eigen::MatrixXd::Random
 Eigen::MatrixXd kernel = Eigen::MatrixXd::Random(M, N);
 // Scratch vector of length M for the intermediate product kernel * x
 Eigen::VectorXd tmp_arr(M);
 // Per-element weights applied by CalMx; all ones here
 Eigen::VectorXd p = Eigen::VectorXd::Constant(N, 1.0);
 // Ax callback for the Eigen solvers: prod_Ax = kernel^T * (kernel * x), using
 // the global tmp_arr for the intermediate kernel * x. `instance` is unused.
 void CalAx(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Ax)
 {
 	tmp_arr = kernel * x;
 	prod_Ax = kernel.transpose() * tmp_arr;
 	return;
 }
 // Mx (preconditioner) callback: element-wise product of the global vector p
 // with x, written to prod_Mx. `instance` is unused.
 void CalMx(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Mx)
 {
 	prod_Mx = p.cwiseProduct(x);
 	return;
 }
 // Progress-monitoring callback for the solver: prints the iteration count k and
 // the current convergence value on one line (the leading '\r' moves the cursor
 // back to the start of the line). Always returns 0.
 // NOTE(review): the meaning of the return value is defined by the library — confirm.
 int Prog(void* instance, const Eigen::VectorXd *m, const lcg_float converge, 
 	const lcg_para *param, const int k)
 {
 	std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
 	return 0;
 }
 // Sample driver: builds a synthetic solution fm, forms B = kernel^T * kernel * fm,
 // then runs each Eigen-based solver from a zero start, printing the solver
 // status, the largest deviation from fm and the CPU time measured with clock().
 int main(int argc, char const *argv[])
 {
 	// Generate a synthetic reference solution: Random(N) is shifted by 1, scaled by
 	// 0.5*Range and offset by LO
 	lcg_float LO = 1.0, HI = 2.0, Range = HI - LO;
 	Eigen::VectorXd fm = Eigen::VectorXd::Random(N);
 	fm = (fm + Eigen::VectorXd::Constant(N, 1.0))*0.5*Range;
 	fm = (fm + Eigen::VectorXd::Constant(N, LO));
 	// Compute the right-hand side B from fm
 	Eigen::VectorXd B(N);
 	tmp_arr = kernel * fm;
 	B = kernel.transpose() * tmp_arr;
 	/******************** preparation finished ************************/
 	lcg_para self_para = lcg_default_parameters();
 	self_para.epsilon = 1e-5;
 	self_para.abs_diff = 0;
 	// Declare the solution vector (starts at zero) and the bounds for the
 	// constrained solvers
 	Eigen::VectorXd m = Eigen::VectorXd::Zero(N);
 	//Eigen::VectorXd p = Eigen::VectorXd::Constant(N, 1.0);
 	Eigen::VectorXd low = Eigen::VectorXd::Constant(N, LO);
 	Eigen::VectorXd hig = Eigen::VectorXd::Constant(N, HI);
 	// Each stanza: time one solver, print its status string, the error and the
 	// elapsed time in milliseconds, then reset m to zero
 	std::clog << "solver: cg" << std::endl;
 	clock_t start = clock();
 	int ret = lcg_solver_eigen(CalAx, Prog, m, B, &self_para, NULL, LCG_CG);
 	clock_t end = clock();
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m) << std::endl;
 	std::clog << "time use: "<<1000*(end-start)/(double)CLOCKS_PER_SEC<<" ms" << std::endl;
 	m.setZero();
 	std::clog << "solver: pcg" << std::endl;
 	start = clock();
 	ret = lcg_solver_preconditioned_eigen(CalAx, CalMx, Prog, m, B, &self_para, NULL, LCG_PCG);
 	end = clock();
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m) << std::endl;
 	std::clog << "time use: "<<1000*(end-start)/(double)CLOCKS_PER_SEC<<" ms" << std::endl;
 	m.setZero();
 	std::clog << "solver: cgs" << std::endl;
 	start = clock();
 	ret = lcg_solver_eigen(CalAx, Prog, m, B, &self_para, NULL, LCG_CGS);
 	end = clock();
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m) << std::endl;
 	std::clog << "time use: "<<1000*(end-start)/(double)CLOCKS_PER_SEC<<" ms" << std::endl;
 	m.setZero();
 	std::clog << "solver: bicgstab" << std::endl;
 	start = clock();
 	ret = lcg_solver_eigen(CalAx, Prog, m, B, &self_para, NULL, LCG_BICGSTAB);
 	end = clock();
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m) << std::endl;
 	std::clog << "time use: "<<1000*(end-start)/(double)CLOCKS_PER_SEC<<" ms" << std::endl;
 	m.setZero();
 	std::clog << "solver: bicgstab2" << std::endl;
 	start = clock();
 	ret = lcg_solver_eigen(CalAx, Prog, m, B, &self_para, NULL, LCG_BICGSTAB2);
 	end = clock();
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m) << std::endl;
 	std::clog << "time use: "<<1000*(end-start)/(double)CLOCKS_PER_SEC<<" ms" << std::endl;
 	m.setZero();
 	std::clog << "solver: pg" << std::endl;
 	start = clock();
 	ret = lcg_solver_constrained_eigen(CalAx, Prog, m, B, low, hig, &self_para, NULL, LCG_PG);
 	end = clock();
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m) << std::endl;
 	std::clog << "time use: "<<1000*(end-start)/(double)CLOCKS_PER_SEC<<" ms" << std::endl;
 	m.setZero();
 	std::clog << "solver: spg" << std::endl;
 	start = clock();
 	ret = lcg_solver_constrained_eigen(CalAx, Prog, m, B, low, hig, &self_para, NULL, LCG_SPG);
 	end = clock();
 	std::clog << std::endl; lcg_error_str(ret);
 	std::clog << "maximal difference: " << max_diff(fm, m) << std::endl;
 	std::clog << "time use: "<<1000*(end-start)/(double)CLOCKS_PER_SEC<<" ms" << std::endl;
 	return 0;
 }
--- a/src/sample/sample6.cpp
+++ b/src/sample/sample6.cpp
@@ -0,0 +1,235 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "iostream"
 #include "fstream"
 #include "complex"
 #include "../lib/lcg_complex.h"
 #include "../lib/solver_eigen.h"
 #include "Eigen/Sparse"
 typedef Eigen::SparseMatrix<lcg_complex, Eigen::RowMajor> spmat_cd; // 注意Eigen默认的稀疏矩阵排序为列优先
 typedef Eigen::Triplet<lcg_complex> triplt_cd;
 // Load a sparse complex system from a binary file.
 // Layout read: int N, int nz, then nz records of (int row, int col,
 // std::complex<double> value), then N std::complex<double> entries of b.
 // The four output arrays are allocated with new[] (zero-initialized); the
 // caller owns them and must delete[] them.
 // Throws std::ios_base::failure when the file cannot be opened or when the
 // header (N, nz) cannot be read or is negative. Previously such a file left
 // *pN / *pnz uninitialized and they were used as allocation sizes.
 void read(std::string filePath, int *pN, int *pnz, lcg_complex **cooVal, 
 	int **cooRowIdx, int **cooColIdx, lcg_complex **b)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("read: cannot open file " + filePath);
 	}
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	if (!in || *pN < 0 || *pnz < 0)
 	{
 		throw std::ios_base::failure("read: invalid header in file " + filePath);
 	}
 	*cooVal = new lcg_complex[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new lcg_complex[*pN]{};
 	// Values are stored on disk as std::complex<double>; copy them component-wise
 	// into lcg_complex.
 	std::complex<double> std_c;
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&std_c, sizeof(std_c));
 		(*cooVal)[i].real(std_c.real());
 		(*cooVal)[i].imag(std_c.imag());
 	}
 	for (int i = 0; i < *pN; i++)
 	{
 		in.read((char*)&std_c, sizeof(std_c));
 		(*b)[i].real(std_c.real());
 		(*b)[i].imag(std_c.imag());
 	}
 }
 void readAnswer(std::string filePath, int *pN, lcg_complex **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	in.read((char*)pN, sizeof(int));
 	*x = new lcg_complex[*pN]{};
 	std::complex<double> std_c;
 	for (size_t i = 0; i < *pN; i++)
 	{
 		in.read((char*)&std_c, sizeof(std_c));
 		(*x)[i].real(std_c.real());
 		(*x)[i].imag(std_c.imag());
 	}
    return;
 }
 // Largest modulus |a[i] - b[i]| over a.size() entries; returns -1 when a is empty.
 // The previous version accumulated re^2 + im^2 (the squared modulus) although
 // the result is printed as the "maximal difference"; std::abs yields the
 // modulus itself, matching the other samples.
 lcg_float max_diff(const Eigen::VectorXcd &a, const Eigen::VectorXcd &b)
 {
 	lcg_float max = -1;
 	std::complex<lcg_float> t;
 	for (int i = 0; i < a.size(); i++)
 	{
 		t = a[i] - b[i];
 		max = lcg_max(std::abs(t), max);
 	}
 	return max;
 }
 // Sample complex solver object backed by a sparse Eigen kernel and a vector P
 // used for element-wise preconditioning.
 class TESTFUNC : public CLCG_EIGEN_Solver
 {
 public:
 	TESTFUNC(int n);
 	~TESTFUNC();
 	void set_kernel(int *row_id, int *col_id, lcg_complex *val, int nz_size);
 	void set_p();
 	// Ax product used by the solver: multiplies x by the conjugated kernel when
 	// conjugate == Conjugate, otherwise by the kernel itself. `layout` is not used.
 	void AxProduct(const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Ax, 
 		lcg_matrix_e layout, clcg_complex_e conjugate)
 	{
 		if (conjugate == Conjugate)
 		{
 			prod_Ax = kernel.conjugate() * x;
 		}
 		else
 		{
 			prod_Ax = kernel * x;
 		}
 	}
 	// Mx product: element-wise product of P and x. `layout` and `conjugate` are not used.
 	void MxProduct(const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Mx, 
 		lcg_matrix_e layout, clcg_complex_e conjugate)
 	{
 		prod_Mx = P.cwiseProduct(x);
 	}
 private:
 	spmat_cd kernel;
 	Eigen::VectorXcd P;
 	int n_size;
 };
 // Remember the problem size and size the (empty) sparse kernel and P accordingly.
 TESTFUNC::TESTFUNC(int n)
 	: n_size(n)
 {
 	kernel.resize(n, n);
 	kernel.setZero();
 	P.resize(n);
 }
 // Shrink the sparse kernel to 0x0 on destruction.
 TESTFUNC::~TESTFUNC()
 {
 	kernel.resize(0, 0);
 }
 void TESTFUNC::set_kernel(int *row_id, int *col_id, lcg_complex *val, int nz_size)
 {
 	std::vector<triplt_cd> val_triplt;
 	for (size_t i = 0; i < nz_size; i++)
 	{
 		val_triplt.push_back(triplt_cd(row_id[i], col_id[i], val[i]));
 	}
 	kernel.setFromTriplets(val_triplt.begin(), val_triplt.end());
 	return;
 }
 // Set P[k] to the reciprocal of the kernel's diagonal entry (k, k).
 // A zero or missing diagonal entry is not guarded against.
 void TESTFUNC::set_p()
 {
 	for (size_t k = 0; k < n_size; ++k)
 	{
 		P[k] = 1.0/kernel.coeff(k, k);
 	}
 }
 // Sample driver: loads a complex system and its reference solution from
 // data/case_10K_cA and data/case_10K_cB, copies them into Eigen vectors, then
 // runs four plain and two preconditioned solvers from a zero start and prints
 // the value returned by max_diff against the reference after each run.
 int main(int argc, char const *argv[])
 {
 	int N, nz;
 	int *rowIdxA, *colIdxA;
 	lcg_complex *A, *b, *ans_x;
 	read("data/case_10K_cA", &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	// readAnswer stores the size found in the answer file into N as well
 	readAnswer("data/case_10K_cB", &N, &ans_x);
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	TESTFUNC test(N);
 	test.set_kernel(rowIdxA, colIdxA, A, nz);
 	test.set_p();
 	// Copy the right-hand side and the reference solution into Eigen vectors
 	Eigen::VectorXcd B, ANS;
 	B.resize(N);
 	ANS.resize(N);
 	for (size_t k = 0; k < N; k++)
 	{
 		B[k] = b[k];
 		ANS[k] = ans_x[k];
 	}
 	/******************** preparation finished ************************/
 	clcg_para self_para = clcg_default_parameters();
 	self_para.epsilon = 1e-16;
 	self_para.abs_diff = 0;
 	test.set_clcg_parameter(self_para);
 	test.set_report_interval(10);
 	// Solution vector, starting at zero and reset before every later run
 	Eigen::VectorXcd m = Eigen::VectorXcd::Constant(N, std::complex<double>(0.0, 0.0));
 	auto report = [&]() {
 		std::clog << "maximal difference: " << max_diff(ANS, m) << std::endl << std::endl;
 	};
 	test.Minimize(m, B, CLCG_BICG);
 	report();
 	m.setZero();
 	test.Minimize(m, B, CLCG_BICG_SYM);
 	report();
 	m.setZero();
 	test.Minimize(m, B, CLCG_CGS);
 	report();
 	m.setZero();
 	test.Minimize(m, B, CLCG_TFQMR);
 	report();
 	m.setZero();
 	test.MinimizePreconditioned(m, B, CLCG_PCG);
 	report();
 	m.setZero();
 	test.MinimizePreconditioned(m, B, CLCG_PBICG);
 	report();
 	// Release the Eigen storage and the arrays allocated by read/readAnswer
 	B.resize(0);
 	ANS.resize(0);
 	m.resize(0);
 	delete[] A;
 	delete[] rowIdxA;
 	delete[] colIdxA;
 	delete[] b;
 	delete[] ans_x;
 	return 0;
 }
--- a/src/sample/sample7.cpp
+++ b/src/sample/sample7.cpp
@@ -0,0 +1,233 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include "iostream"
 #include "fstream"
 #include "../lib/solver_eigen.h"
 #include "../lib/preconditioner_eigen.h"
 typedef std::complex<double> complex_d;
 typedef Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> spmat_cd;
 typedef Eigen::Triplet<complex_d> triplt_cd;
 typedef Eigen::VectorXcd vector_cd;
 // Load a sparse complex system from a binary file.
 // Layout read: int N, int nz, then nz records of (int row, int col,
 // complex_d value), then N complex_d entries of b read in one block.
 // The four output arrays are allocated with new[] (zero-initialized); the
 // caller owns them and must delete[] them.
 // Throws std::ios_base::failure when the file cannot be opened or when the
 // header (N, nz) cannot be read or is negative. Previously such a file left
 // *pN / *pnz uninitialized and they were used as allocation sizes.
 void read(std::string filePath, int *pN, int *pnz, complex_d **cooVal, 
 	int **cooRowIdx, int **cooColIdx, complex_d **b)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("read: cannot open file " + filePath);
 	}
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	if (!in || *pN < 0 || *pnz < 0)
 	{
 		throw std::ios_base::failure("read: invalid header in file " + filePath);
 	}
 	*cooVal = new complex_d[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new complex_d[*pN]{};
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&(*cooVal)[i], sizeof(complex_d));
 	}
 	in.read((char*)(*b), sizeof(complex_d)*(*pN));
 }
 // Load a reference solution from a binary file: int N followed by N complex_d
 // values read in one block. *x is allocated with new[] and owned by the
 // caller. Note that *pN is overwritten with the size stored in this file.
 // Throws std::ios_base::failure when the file cannot be opened or when N
 // cannot be read or is negative, instead of allocating from an uninitialized size.
 void readAnswer(std::string filePath, int *pN, complex_d **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in.is_open())
 	{
 		throw std::ios_base::failure("readAnswer: cannot open file " + filePath);
 	}
 	in.read((char*)pN, sizeof(int));
 	if (!in || *pN < 0)
 	{
 		throw std::ios_base::failure("readAnswer: invalid header in file " + filePath);
 	}
 	*x = new complex_d[*pN]{};
 	in.read((char*)(*x), sizeof(complex_d)*(*pN));
 }
 double max_diff(const vector_cd &a, const vector_cd &b)
 {
 	double max = -1;
 	complex_d t;
 	for (int i = 0; i < a.size(); i++)
 	{
 		t = a[i] - b[i];
 		max = lcg_max(std::sqrt(std::norm(t)), max);
 	}
 	return max;
 }
 // Sample complex solver object backed by a sparse Eigen kernel plus lower and
 // upper triangular factors used for preconditioning.
 class TESTFUNC : public CLCG_EIGEN_Solver
 {
 public:
 	TESTFUNC(int n);
 	~TESTFUNC();
 	void set_kernel(int *row_id, int *col_id, complex_d *val, int nz_size);
 	void set_preconditioner();
 	// Ax product used by the solver: multiplies x by the conjugated kernel when
 	// conjugate == Conjugate, otherwise by the kernel itself. `layout` is not used.
 	void AxProduct(const vector_cd &x, vector_cd &prod_Ax, lcg_matrix_e layout, clcg_complex_e conjugate)
 	{
 		if (conjugate == Conjugate)
 		{
 			prod_Ax = kernel.conjugate() * x;
 		}
 		else
 		{
 			prod_Ax = kernel * x;
 		}
 	}
 	// Mx product used by the preconditioned solvers. `layout` and `conjugate`
 	// are not used.
 	void MxProduct(const vector_cd &x, vector_cd &prod_Mx, lcg_matrix_e layout, clcg_complex_e conjugate)
 	{
 		// No preconditioning
 		//prod_Mx = x;
 		// Preconditioning using the diagonal kernel
 		//prod_Mx = p.cwiseProduct(x);
 		// Preconditioning using the ILUT/IC factors: lower-triangular solve into
 		// the scratch vector p, then upper-triangular solve into prod_Mx
 		clcg_solve_lower_triangle(l_tri, x, p);
 		clcg_solve_upper_triangle(u_tri, p, prod_Mx);
 	}
 private:
 	// Sparse kernel matrix and its lower/upper triangular factors
 	spmat_cd kernel, l_tri, u_tri;
 	// Scratch vector for the intermediate triangular solve
 	vector_cd p;
 	int n_size;
 };
 // Remember the problem size and size the (empty) sparse kernel and the
 // scratch vector p accordingly.
 TESTFUNC::TESTFUNC(int n)
 	: n_size(n)
 {
 	kernel.resize(n, n);
 	kernel.setZero();
 	p.resize(n);
 }
 // Shrink the kernel, both triangular factors and the scratch vector to size
 // zero on destruction.
 TESTFUNC::~TESTFUNC()
 {
 	kernel.resize(0, 0);
 	l_tri.resize(0, 0);
 	u_tri.resize(0, 0);
 	p.resize(0);
 }
 void TESTFUNC::set_kernel(int *row_id, int *col_id, complex_d *val, int nz_size)
 {
 	std::vector<triplt_cd> val_triplt;
 	for (size_t i = 0; i < nz_size; i++)
 	{
 		val_triplt.push_back(triplt_cd(row_id[i], col_id[i], val[i]));
 	}
 	kernel.setFromTriplets(val_triplt.begin(), val_triplt.end());
 	return;
 }
 // Prepare the preconditioner used by MxProduct. Only option 3 is active: an
 // incomplete Cholesky factor is stored in l_tri and its transpose in u_tri.
 // The other options are kept as commented-out alternatives.
 void TESTFUNC::set_preconditioner()
 {
 	// 1. Preconditioning using the reciprocal of the kernel's diagonal entries
 	/*
 	for (size_t i = 0; i < n_size; i++)
 	{
 		p[i] = 1.0/kernel.coeff(i, i);
 	}
 	*/
 	// 2. Preconditioning using the incomplete LU decomposition
 	//incomplete_LU(kernel, l_tri, u_tri);
 	// 3. Preconditioning using the incomplete Cholesky decomposition
 	clcg_incomplete_Cholesky(kernel, l_tri);
 	u_tri = l_tri.transpose();
 	// 4. Preconditioning using compressed incomplete decompositions
 	/*
 	vector_cd one = Eigen::VectorXcd::Ones(n_size);
 	vector_cd x = Eigen::VectorXcd::Zero(n_size);
 	solve_lower_triangle(l_tri, one, x);
 	solve_upper_triangle(u_tri, x, p);
 	*/
 	return;
 }
 int main(int argc, char const *argv[]) try
 {
 	std::string inputPath = "data/case_1K_cA";
 	std::string answerPath = "data/case_1K_cB";
 	int N;
 	int nz;
 	complex_d *A;
 	int *rowIdxA;
 	int *colIdxA;
 	complex_d *b;
 	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	complex_d *ans_x;
 	readAnswer(answerPath, &N, &ans_x);
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	TESTFUNC test(N);
 	test.set_kernel(rowIdxA, colIdxA, A, nz);
 	test.set_preconditioner();
 	vector_cd B, ANS;
 	B.resize(N);
 	ANS.resize(N);
 	for (size_t i = 0; i < N; i++)
 	{
 		B[i] = b[i];
 		ANS[i] = ans_x[i];
 	}
 	/********************准备工作完成************************/
 	clcg_para self_para = clcg_default_parameters();
 	self_para.epsilon = 1e-12;
 	self_para.abs_diff = 0;
 	test.set_clcg_parameter(self_para);
 	test.set_report_interval(10);
 	Eigen::VectorXcd m = Eigen::VectorXcd::Constant(N, std::complex<double>(0.0, 0.0));
 	test.MinimizePreconditioned(m, B, CLCG_PCG);
 	std::clog << "maximal difference: " << max_diff(ANS, m) << std::endl << std::endl;
 	m.setZero();
 	test.MinimizePreconditioned(m, B, CLCG_PBICG);
 	std::clog << "maximal difference: " << max_diff(ANS, m) << std::endl << std::endl;
 	ANS.resize(0);
 	B.resize(0);
 	m.resize(0);
 	return 0;
 }
 catch (std::exception &e)
 {
 	std::cerr << e.what() << std::endl;
 }
--- a/src/sample/sample8.cu
+++ b/src/sample/sample8.cu
@@ -0,0 +1,312 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cmath>
 #include "../lib/lcg_cuda.h"
 // Loads a sparse linear system from a binary file.
 // File layout, in reading order: int N, int nz, then nz records of
 // (int row, int column, double value), then N doubles holding the vector b.
 //   filePath   path of the file to read
 //   pN, pnz    receive N and nz
 //   cooVal, cooRowIdx, cooColIdx   receive new[]-allocated arrays of nz entries
 //   b          receives a new[]-allocated array of N entries
 // The caller owns the returned arrays and releases them with delete[].
 // Throws std::ios_base::failure when the file cannot be opened, the header is
 // invalid, or the file ends early; no array is handed out in that case.
 void read(std::string filePath, int *pN, int *pnz, double **cooVal,
 	int **cooRowIdx, int **cooColIdx, double **b)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in)
 	{
 		throw std::ios_base::failure("read: cannot open file " + filePath);
 	}
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	// The sizes feed new[] below, so reject a short or nonsensical header first
 	if (!in || *pN < 0 || *pnz < 0)
 	{
 		throw std::ios_base::failure("read: invalid header in file " + filePath);
 	}
 	*cooVal = new double[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new double[*pN]{};
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&(*cooVal)[i], sizeof(double));
 	}
 	in.read((char*)(*b), sizeof(double)*(*pN));
 	// A failed stream here means at least one read came up short
 	if (!in)
 	{
 		delete[] *cooVal;
 		delete[] *cooRowIdx;
 		delete[] *cooColIdx;
 		delete[] *b;
 		*cooVal = nullptr;
 		*cooRowIdx = nullptr;
 		*cooColIdx = nullptr;
 		*b = nullptr;
 		throw std::ios_base::failure("read: unexpected end of file " + filePath);
 	}
 	return;
 }
 // Loads a reference solution vector from a binary file.
 // File layout: int N followed by N doubles.
 //   filePath   path of the file to read
 //   pN         receives N
 //   x          receives a new[]-allocated array of N entries (release with delete[])
 // Throws std::ios_base::failure when the file cannot be opened, the header is
 // invalid, or the file ends early; no array is handed out in that case.
 void readAnswer(std::string filePath, int *pN, double **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in)
 	{
 		throw std::ios_base::failure("readAnswer: cannot open file " + filePath);
 	}
 	in.read((char*)pN, sizeof(int));
 	// The size feeds new[] below, so reject a short or nonsensical header first
 	if (!in || *pN < 0)
 	{
 		throw std::ios_base::failure("readAnswer: invalid header in file " + filePath);
 	}
 	*x = new double[*pN]{};
 	in.read((char*)(*x), sizeof(double)*(*pN));
 	if (!in)
 	{
 		delete[] *x;
 		*x = nullptr;
 		throw std::ios_base::failure("readAnswer: unexpected end of file " + filePath);
 	}
 	return;
 }
 // Error measure between two arrays: the Euclidean norm of (a - b) divided by n.
 //   a, b   arrays with at least n readable entries each
 //   n      number of entries to compare
 // Returns 0 when n <= 0, since there is nothing to compare (the unguarded
 // formula would divide by zero, or run past the arrays for a negative n).
 lcg_float avg_error(lcg_float *a, lcg_float *b, int n)
 {
 	if (n <= 0)
 	{
 		return 0.0;
 	}
 	lcg_float sq_sum = 0.0;
 	for (int i = 0; i < n; i++)
 	{
 		const lcg_float diff = a[i] - b[i];
 		sq_sum += diff*diff;
 	}
 	return sqrt(sq_sum)/n;
 }
 // Declare as global variables
 // File-scope state shared between main() and the cudaAx()/cudaMx() callbacks.
 lcg_float one = 1.0; // scalar passed by address to cusparseSpMV and cusparseDcsrsv2_solve
 lcg_float zero = 0.0; // scalar passed by address to cusparseSpMV
 void *d_buf; // scratch buffer handed to the cuSPARSE routines; allocated with cudaMalloc in main()
 cusparseSpMatDescr_t smat_A; // sparse matrix descriptor created by cusparseCreateCsr in main()
 int *d_rowIdxA; // COO row indices copied from the host; input of cusparseXcoo2csr
 int *d_rowPtrA; // CSR row pointers (N + 1 entries) written by cusparseXcoo2csr
 int *d_colIdxA; // column indices copied from the host
 double *d_A; // non-zero values of A copied from the host
 double *d_pd; // N-element buffer; holds the intermediate result between the two triangular solves in cudaMx()
 double *d_ic; // copy of d_A that cusparseDcsric02 factorizes in main(); read by the triangular solves
 cusparseMatDescr_t descr_A = 0; // matrix descriptor passed to the csric02 routines
 cusparseMatDescr_t descr_L = 0; // matrix descriptor passed to the csrsv2 routines
 csric02Info_t icinfo_A = 0; // info object of the incomplete-Cholesky factorization
 csrsv2Info_t info_L = 0; // info object of the non-transposed triangular solve
 csrsv2Info_t info_LT = 0; // info object of the transposed triangular solve
 void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
 {
 	// Calculate the product of A*x
 	cusparseSpMV(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
 		x, &zero, prod_Ax, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, d_buf);
    return;
 }
 void cudaMx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
 {
 	void *d_x, *d_Ax;
 	cusparseDnVecGetValues(x, &d_x);
 	cusparseDnVecGetValues(prod_Ax, &d_Ax);
 	cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
 		n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, (double*) d_x, (double*) d_pd, 
 		CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
 	cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, 
 		n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, (double*) d_pd, (double*) d_Ax, 
 		CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
    return;
 }
 // Progress callback handed to the solvers. It stays silent until the reported
 // convergence value has dropped to param->epsilon or below, then prints the
 // iteration count and the convergence value to std::clog.
 // Always returns 0. instance, m, n_size and nz_size are not used.
 int cudaProgress(void* instance, const lcg_float* m, const lcg_float converge, 
 	const lcg_para* param, const int n_size, const int nz_size, const int k)
 {
 	const bool reached_tolerance = (converge <= param->epsilon);
 	if (!reached_tolerance)
 	{
 		return 0;
 	}
 	std::clog << "Iteration-times: " << k;
 	std::clog << "\tconvergence: " << converge << std::endl;
 	return 0;
 }
 // Sample driver: loads the real sparse system stored in data/case_10K_A,
 // solves it on the GPU with the CG, CGS and incomplete-Cholesky preconditioned
 // CG solvers, and prints the error of each result against the reference
 // solution stored in data/case_10K_B.
 // Returns 0 when every setup call succeeded and 1 otherwise; the solver runs
 // are skipped when the setup failed.
 int main(int argc, char **argv)
 {
 	std::string inputPath = "data/case_10K_A";
 	std::string answerPath = "data/case_10K_B";
 	int N;
 	int nz;
 	double *A;
 	int *rowIdxA;
 	int *colIdxA;
 	double *b;
 	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	double *ans_x;
 	readAnswer(answerPath, &N, &ans_x);
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	// Report and remember any failing runtime/library call. cudaSuccess,
 	// CUBLAS_STATUS_SUCCESS and CUSPARSE_STATUS_SUCCESS are all 0, so a single
 	// integer comparison serves the three status enums.
 	bool setup_ok = true;
 	auto check = [&setup_ok](int status, const char *what)
 	{
 		if (status != 0)
 		{
 			std::cerr << what << " failed with status " << status << std::endl;
 			setup_ok = false;
 		}
 	};
 	// Create handles
 	cublasHandle_t cubHandle = nullptr;
 	cusparseHandle_t cusHandle = nullptr;
 	check(cublasCreate(&cubHandle), "cublasCreate");
 	check(cusparseCreate(&cusHandle), "cusparseCreate");
 	// Allocate GPU memory & copy matrix/vector to device
 	check(cudaMalloc(&d_A, nz * sizeof(double)), "cudaMalloc(d_A)");
 	check(cudaMalloc(&d_rowIdxA, nz * sizeof(int)), "cudaMalloc(d_rowIdxA)");
 	check(cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int)), "cudaMalloc(d_rowPtrA)");
 	check(cudaMalloc(&d_colIdxA, nz * sizeof(int)), "cudaMalloc(d_colIdxA)");
 	check(cudaMalloc(&d_pd, N * sizeof(double)), "cudaMalloc(d_pd)");
 	check(cudaMemcpy(d_A, A, nz * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
 	check(cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(d_rowIdxA)");
 	check(cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(d_colIdxA)");
 	// Convert matrix A from COO format to CSR format
 	check(cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO), "cusparseXcoo2csr");
 	// Create sparse matrix
 	check(cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F), "cusparseCreateCsr");
 	// This is just used to get bufferSize;
 	cusparseDnVecDescr_t dvec_tmp = nullptr;
 	check(cusparseCreateDnVec(&dvec_tmp, N, d_pd, CUDA_R_64F), "cusparseCreateDnVec");
 	size_t bufferSize_B = 0;
 	check(cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
 		dvec_tmp, &zero, dvec_tmp, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize_B), "cusparseSpMV_bufferSize");
 	// --- Start of the preconditioning part ---
 	// Copy A: the factorization below overwrites its input, so it works on d_ic
 	check(cudaMalloc(&d_ic, nz * sizeof(lcg_float)), "cudaMalloc(d_ic)");
 	check(cudaMemcpy(d_ic, d_A, nz * sizeof(lcg_float), cudaMemcpyDeviceToDevice), "cudaMemcpy(d_ic)");
 	// create descriptor for matrix A
 	check(cusparseCreateMatDescr(&descr_A), "cusparseCreateMatDescr(descr_A)");
 	// initialize properties of matrix A
 	check(cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_GENERAL), "cusparseSetMatType(descr_A)");
 	check(cusparseSetMatFillMode(descr_A, CUSPARSE_FILL_MODE_LOWER), "cusparseSetMatFillMode(descr_A)");
 	check(cusparseSetMatDiagType(descr_A, CUSPARSE_DIAG_TYPE_NON_UNIT), "cusparseSetMatDiagType(descr_A)");
 	check(cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO), "cusparseSetMatIndexBase(descr_A)");
 	// create descriptor for matrix L
 	check(cusparseCreateMatDescr(&descr_L), "cusparseCreateMatDescr(descr_L)");
 	// initialize properties of matrix L
 	check(cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL), "cusparseSetMatType(descr_L)");
 	check(cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER), "cusparseSetMatFillMode(descr_L)");
 	check(cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT), "cusparseSetMatDiagType(descr_L)");
 	check(cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO), "cusparseSetMatIndexBase(descr_L)");
 	// Create empty info objects for incomplete-cholesky factorization
 	check(cusparseCreateCsric02Info(&icinfo_A), "cusparseCreateCsric02Info");
 	check(cusparseCreateCsrsv2Info(&info_L), "cusparseCreateCsrsv2Info(info_L)");
 	check(cusparseCreateCsrsv2Info(&info_LT), "cusparseCreateCsrsv2Info(info_LT)");
 	// Compute buffer size in computing ic factorization
 	int bufferSize_A = 0, bufferSize_L = 0, bufferSize_LT = 0;
 	check(cusparseDcsric02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA,
 		d_colIdxA, icinfo_A, &bufferSize_A), "cusparseDcsric02_bufferSize");
 	check(cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
 		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, &bufferSize_L), "cusparseDcsrsv2_bufferSize(L)");
 	check(cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_TRANSPOSE,
 		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, &bufferSize_LT), "cusparseDcsrsv2_bufferSize(LT)");
 	// One scratch buffer serves every routine, so take the largest request.
 	// The maximum is kept in size_t: the SpMV query reports a size_t that must
 	// not be narrowed to int.
 	size_t bufferSize = bufferSize_B;
 	const int legacy_sizes[3] = {bufferSize_A, bufferSize_L, bufferSize_LT};
 	for (int i = 0; i < 3; i++)
 	{
 		if (legacy_sizes[i] > 0 && static_cast<size_t>(legacy_sizes[i]) > bufferSize)
 		{
 			bufferSize = static_cast<size_t>(legacy_sizes[i]);
 		}
 	}
 	check(cudaMalloc(&d_buf, bufferSize), "cudaMalloc(d_buf)");
 	// Perform incomplete-cholesky factorization: analysis phase
 	check(cusparseDcsric02_analysis(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA,
 		d_colIdxA, icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf), "cusparseDcsric02_analysis");
 	check(cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
 		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf),
 		"cusparseDcsrsv2_analysis(L)");
 	check(cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_TRANSPOSE,
 		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf),
 		"cusparseDcsrsv2_analysis(LT)");
 	// Perform incomplete-cholesky factorization: solve phase
 	check(cusparseDcsric02(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, d_colIdxA,
 		icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf), "cusparseDcsric02");
 	// --- End of the preconditioning part ---
 	// Declare an initial solution
 	lcg_para self_para = lcg_default_parameters();
 	self_para.epsilon = 1e-6;
 	self_para.abs_diff = 0;
 	int ret;
 	double *host_m = new double[N];
 	// Every solver run starts from a zero initial solution
 	auto reset_solution = [host_m, N]()
 	{
 		for (int i = 0; i < N; i++)
 		{
 			host_m[i] = 0.0;
 		}
 	};
 	if (setup_ok)
 	{
 		// Solve with CG
 		reset_solution();
 		ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CG);
 		lcg_error_str(ret);
 		std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 		// Solve with CGS
 		reset_solution();
 		ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CGS);
 		lcg_error_str(ret);
 		std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 		// Solve with PCG
 		reset_solution();
 		ret = lcg_solver_preconditioned_cuda(cudaAx, cudaMx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_PCG);
 		lcg_error_str(ret);
 		std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	}
 	else
 	{
 		std::cerr << "GPU setup failed; the solver runs are skipped." << std::endl;
 	}
 	// Free Host memory
 	delete[] A;
 	delete[] rowIdxA;
 	delete[] colIdxA;
 	delete[] b;
 	delete[] ans_x;
 	delete[] host_m;
 	// Free Device memory
 	cudaFree(d_A);
 	cudaFree(d_rowIdxA);
 	cudaFree(d_rowPtrA);
 	cudaFree(d_colIdxA);
 	cudaFree(d_pd);
 	cudaFree(d_ic);
 	cusparseDestroyDnVec(dvec_tmp);
 	cusparseDestroySpMat(smat_A);
 	cudaFree(d_buf);
 	cusparseDestroyMatDescr(descr_A);
 	cusparseDestroyMatDescr(descr_L);
 	cusparseDestroyCsric02Info(icinfo_A);
 	cusparseDestroyCsrsv2Info(info_L);
 	cusparseDestroyCsrsv2Info(info_LT);
 	// Free handles
 	cublasDestroy(cubHandle);
 	cusparseDestroy(cusHandle);
 	return setup_ok ? 0 : 1;
 }
--- a/src/sample/sample9.cu
+++ b/src/sample/sample9.cu
@@ -0,0 +1,221 @@
 /******************************************************
 * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
 * 
 * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
 * 
 * LibLCG is distributed under a dual licensing scheme. You can
 * redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License (LGPL) as published by the Free Software Foundation,
 * either version 2 of the License, or (at your option) any later version. 
 * You should have received a copy of the GNU Lesser General Public 
 * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
 * 
 * If the terms and conditions of the LGPL v.2. would prevent you from
 * using the LibLCG, please consider the option to obtain a commercial
 * license for a fee. These licenses are offered by the LibLCG developing 
 * team. As a rule, licenses are provided "as-is", unlimited in time for 
 * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
 * Please do not forget to include some description of your company and the 
 * realm of its activities. Also add information on how to contact you by 
 * electronic and paper mail.
 ******************************************************/
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cmath>
 #include "../lib/clcg_cuda.h"
 // Loads a complex sparse linear system from a binary file.
 // File layout, in reading order: int N, int nz, then nz records of
 // (int row, int column, cuDoubleComplex value), then N cuDoubleComplex values
 // holding the vector b.
 //   filePath   path of the file to read
 //   pN, pnz    receive N and nz
 //   cooVal, cooRowIdx, cooColIdx   receive new[]-allocated arrays of nz entries
 //   b          receives a new[]-allocated array of N entries
 // The caller owns the returned arrays and releases them with delete[].
 // Throws std::ios_base::failure when the file cannot be opened, the header is
 // invalid, or the file ends early; no array is handed out in that case.
 void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
 	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in)
 	{
 		throw std::ios_base::failure("read: cannot open file " + filePath);
 	}
 	in.read((char*)pN, sizeof(int));
 	in.read((char*)pnz, sizeof(int));
 	// The sizes feed new[] below, so reject a short or nonsensical header first
 	if (!in || *pN < 0 || *pnz < 0)
 	{
 		throw std::ios_base::failure("read: invalid header in file " + filePath);
 	}
 	*cooVal = new cuDoubleComplex[*pnz]{};
 	*cooRowIdx = new int[*pnz]{};
 	*cooColIdx = new int[*pnz]{};
 	*b = new cuDoubleComplex[*pN]{};
 	for (int i = 0; i < *pnz; ++i)
 	{
 		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
 		in.read((char*)&(*cooColIdx)[i], sizeof(int));
 		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
 	}
 	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
 	// A failed stream here means at least one read came up short
 	if (!in)
 	{
 		delete[] *cooVal;
 		delete[] *cooRowIdx;
 		delete[] *cooColIdx;
 		delete[] *b;
 		*cooVal = nullptr;
 		*cooRowIdx = nullptr;
 		*cooColIdx = nullptr;
 		*b = nullptr;
 		throw std::ios_base::failure("read: unexpected end of file " + filePath);
 	}
 	return;
 }
 // Loads a complex reference solution vector from a binary file.
 // File layout: int N followed by N cuDoubleComplex values.
 //   filePath   path of the file to read
 //   pN         receives N
 //   x          receives a new[]-allocated array of N entries (release with delete[])
 // Throws std::ios_base::failure when the file cannot be opened, the header is
 // invalid, or the file ends early; no array is handed out in that case.
 void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
 {
 	std::ifstream in(filePath, std::ios::binary);
 	if (!in)
 	{
 		throw std::ios_base::failure("readAnswer: cannot open file " + filePath);
 	}
 	in.read((char*)pN, sizeof(int));
 	// The size feeds new[] below, so reject a short or nonsensical header first
 	if (!in || *pN < 0)
 	{
 		throw std::ios_base::failure("readAnswer: invalid header in file " + filePath);
 	}
 	*x = new cuDoubleComplex[*pN]{};
 	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
 	if (!in)
 	{
 		delete[] *x;
 		*x = nullptr;
 		throw std::ios_base::failure("readAnswer: unexpected end of file " + filePath);
 	}
 	return;
 }
 // Error measure between two complex arrays: the square root of the summed
 // squared magnitudes of clcg_Zdiff(a[i], b[i]), divided by n.
 //   a, b   arrays with at least n readable entries each
 //   n      number of entries to compare
 // Returns 0 when n <= 0, since there is nothing to compare (the unguarded
 // formula would divide by zero, or run past the arrays for a negative n).
 lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
 {
 	if (n <= 0)
 	{
 		return 0.0;
 	}
 	lcg_float sq_sum = 0.0;
 	for (int i = 0; i < n; i++)
 	{
 		const cuDoubleComplex diff = clcg_Zdiff(a[i], b[i]);
 		sq_sum += (diff.x*diff.x + diff.y*diff.y);
 	}
 	return sqrt(sq_sum)/n;
 }
 // Declare as global variables
 // File-scope state shared between main() and the cudaAx() callback.
 cuDoubleComplex one, zero; // SpMV scalars; cudaAx() sets them to (1, 0) and (0, 0) on every call
 void *d_buf; // scratch buffer passed to cusparseSpMV; allocated with cudaMalloc in main()
 cusparseSpMatDescr_t smat_A; // sparse matrix descriptor created by cusparseCreateCsr in main()
 int *d_rowIdxA; // COO row indices copied from the host; input of cusparseXcoo2csr
 int *d_rowPtrA; // CSR row pointers (N + 1 entries) written by cusparseXcoo2csr
 int *d_colIdxA; // column indices copied from the host
 cuDoubleComplex *d_A; // non-zero values of A copied from the host
 cuDoubleComplex *d_B; // N-element buffer backing the temporary dense vector used for the buffer-size queries
 void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
 	cusparseOperation_t oper_t)
 {
 	one.x = 1.0; one.y = 0.0;
 	zero.x = 0.0; zero.y = 0.0;
 	// Calculate the product of A*x
 	cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
    return;
 }
 // Progress callback handed to the solvers. It stays silent until the reported
 // convergence value has dropped to param->epsilon or below, then prints the
 // iteration count and the convergence value to std::clog.
 // Always returns 0. instance, m, n_size and nz_size are not used.
 int cudaProgress(void* instance, const cuDoubleComplex* m, const lcg_float converge, 
 	const clcg_para* param, const int n_size, const int nz_size, const int k)
 {
 	const bool reached_tolerance = (converge <= param->epsilon);
 	if (!reached_tolerance)
 	{
 		return 0;
 	}
 	std::clog << "Iteration-times: " << k;
 	std::clog << "\tconvergence: " << converge << std::endl;
 	return 0;
 }
 int main(int argc, char **argv)
 {
 	std::string inputPath = "data/case_1K_cA";
 	std::string answerPath = "data/case_1K_cB";
 	int N, nz;
 	int *rowIdxA, *colIdxA;
 	cuDoubleComplex *A, *b;
 	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
 	cuDoubleComplex *ans_x;
 	readAnswer(answerPath, &N, &ans_x);
 	std::clog << "N = " << N << std::endl;
 	std::clog << "nz = " << nz << std::endl;
 	// Create handles
 	cublasHandle_t cubHandle;
 	cusparseHandle_t cusHandle;
 	cublasCreate(&cubHandle);
 	cusparseCreate(&cusHandle);
 	// Allocate GPU memory & copy matrix/vector to device
 	cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex));
 	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
 	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
 	cudaMalloc(&d_colIdxA, nz * sizeof(int));
 	cudaMalloc(&d_B, N * sizeof(cuDoubleComplex));
 	cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
 	// Convert matrix A from COO format to CSR format
 	cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
 	// Create sparse matrix
 	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
 		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
 	// This is just used to get bufferSize;
 	cusparseDnVecDescr_t dvec_tmp;
 	cusparseCreateDnVec(&dvec_tmp, N, d_B, CUDA_C_64F);
 	size_t bufferSize_B, bufferSize_B2;
 	cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
 		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize_B);
 	cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE, &one, smat_A,
 		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize_B2);
 	if (bufferSize_B2 > bufferSize_B) bufferSize_B = bufferSize_B2;
 	cudaMalloc(&d_buf, bufferSize_B);
 	// Declare an initial solution
    clcg_para self_para = clcg_default_parameters();
 	self_para.epsilon = 1e-6;
 	self_para.abs_diff = 0;
 	int ret;
 	cuDoubleComplex *host_m = new cuDoubleComplex[N];
 	// Solve with BICG
 	for (size_t i = 0; i < N; i++)
 	{
 		host_m[i].x = 0.0; host_m[i].y = 0.0;	
 	}
    ret = clcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, CLCG_BICG);
    lcg_error_str(ret);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	// Solve with BICG_SYM
 	for (size_t i = 0; i < N; i++)
 	{
 		host_m[i].x = 0.0; host_m[i].y = 0.0;	
 	}
    ret = clcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, CLCG_BICG_SYM);
    lcg_error_str(ret);
 	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
 	// Free Host memory
 	delete[] A;
 	delete[] rowIdxA;
 	delete[] colIdxA;
 	delete[] b;
 	delete[] ans_x;
 	delete[] host_m;
 	// Free Device memory
 	cudaFree(d_A);
 	cudaFree(d_rowIdxA);
 	cudaFree(d_rowPtrA);
 	cudaFree(d_colIdxA);
 	cudaFree(d_B);
 	cusparseDestroyDnVec(dvec_tmp);
 	cusparseDestroySpMat(smat_A);
 	cudaFree(d_buf);
 	// Free handles
 	cublasDestroy(cubHandle);
 	cusparseDestroy(cusHandle);
 	return 0;
 }