initial upload

2024-09-11 13:39:28 +08:00
parent c7e8487a02
commit 834df92696
68 changed files with 21889 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
-# ---> C++
 # Prerequisites
 *.d

@@ -32,3 +31,12 @@
 *.out
 *.app

+# folder preferences and build folder
+.DS_Store
+build/
+pack/
+.vscode/
+out/
+*.sh
+case_*
+config.h
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.15.2)
+# 设置工程名称
+project(LibLCG VERSION 3.1 LANGUAGES CXX)
+# 添加用于编写配置文件的函数
+include(CMakePackageConfigHelpers)
+
+message(STATUS "Platform: " ${CMAKE_HOST_SYSTEM_NAME})
+# CMake默认的安装路径 Windows下为C:/Program\ Files/${Project_Name} Linux/Unix下为/usr/local
+message(STATUS "Install prefix: " ${CMAKE_INSTALL_PREFIX})
+# CMake默认的编译类型为空
+message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
+
+# 添加编译选项
+option(LibLCG_OPENMP "Use OpenMP" ON) # Set OFF to disable the functionality 
+option(LibLCG_EIGEN "Use Eigen" ON)
+option(LibLCG_STD_COMPLEX "Use STD complex" ON)
+option(LibLCG_CUDA "Use CUDA" ON)
+message(STATUS "Use OpenMP: " ${LibLCG_OPENMP})
+message(STATUS "Use Eigen: " ${LibLCG_EIGEN})
+message(STATUS "Use STD complex: " ${LibLCG_STD_COMPLEX})
+message(STATUS "Use CUDA: " ${LibLCG_CUDA})
+
+# 加入一个头文件配置，让cmake对源码进行操作
+configure_file(
+	"${PROJECT_SOURCE_DIR}/config.h.in"
+	"${PROJECT_SOURCE_DIR}/src/lib/config.h"
+	)
+
+# 添加源文件地址
+add_subdirectory(src/)
--- a/2537
+++ b/2537
--- a/524
+++ b/524
@@ -0,0 +1,524 @@
+LibLCG License
+--------------
+
+LibLCG is distributed under a dual licensing scheme. You can
+redistribute it and/or modify it under the terms of the GNU Lesser
+General Public License (LGPL) as published by the Free Software 
+Foundation, either version 2 of the License, or (at your option) any 
+later version. A copy of the GNU Lesser General Public License is 
+reproduced below.
+
+If the terms and conditions of the LGPL v.2. would prevent you from
+using the LibLCG, please consider the option to obtain a commercial
+license for a fee. These licenses are offered by the LibLCG developing 
+team. As a rule, licenses are provided "as-is", unlimited in time for 
+a one time fee. Please send corresponding requests to:
+yizhang-geo@zju.edu.cn. Please do not forget to include some
+description of your company and the realm of its activities. Also add 
+information on how to contact you by electronic and paper mail.
+
+=====================================================================
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
+    USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random
+  Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
--- a/LibLCGConfig.cmake.in
+++ b/LibLCGConfig.cmake.in
@@ -0,0 +1,20 @@
+@PACKAGE_INIT@
+
+set(@PROJECT_NAME@_Version "@PROJECT_VERSION@")
+set_and_check(@PROJECT_NAME@_INSTALL_PREFIX "${PACKAGE_PREFIX_DIR}")
+set_and_check(@PROJECT_NAME@_INC_DIR "${PACKAGE_PREFIX_DIR}/include")
+set_and_check(@PROJECT_NAME@_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/include")
+set_and_check(@PROJECT_NAME@_LIB_DIR "${PACKAGE_PREFIX_DIR}/lib")
+set_and_check(@PROJECT_NAME@_LIBRARY_DIR "${PACKAGE_PREFIX_DIR}/lib")
+
+set(@PROJECT_NAME@_LIB lcg)
+set(@PROJECT_NAME@_LIBRARY lcg)
+set(@PROJECT_NAME@_FOUND 1)
+
+set(@PROJECT_NAME@_OPENMP @LibLCG_OPENMP@)
+set(@PROJECT_NAME@_EIGEN @LibLCG_EIGEN@)
+set(@PROJECT_NAME@_STD_COMPLEX @LibLCG_STD_COMPLEX@)
+set(@PROJECT_NAME@_CUDA @LibLCG_CUDA@)
+
+# include target information
+include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
--- a/README.md
+++ b/README.md
@@ -1,2 +1,225 @@
-# liblcg
+# C++ Library of the Linear Conjugate Gradient Methods (LibLCG) 说明文档

+张壹（yizhang-geo@zju.edu.cn）
+
+_浙江大学地球科学学院·地球物理研究所_
+
+**此说明仅覆盖算法库的简单介绍及使用，更详细的内容请查看代码注释。如果还有问题，请发邮件联系我。同时也欢迎有兴趣的同学加入开发团队！**
+
+## 简介
+
+liblcg 是一个高效的、可扩展的 C++ 线性共轭梯度算法库，在原生数据结构接口的基础上，同时提供基于Eigen3和CUDA的算法接口，可以方便地实现基于CPU或GPU并行的加速计算，其中基于Eigen3的算法包含了稠密与稀疏矩阵的实现，而基于CUDA的算法主要为稀疏矩阵的实现。liblcg 包含多种实数与复数域的共轭梯度算法与其他一些迭代求解方法。目前已有的方法包括共轭梯度法、预优的共轭梯度算法、共轭梯度平方算法、双稳共轭梯度算法、BB步共轭梯度投影法与SPG共轭梯度投影法；复数域的双共轭梯度法、共轭梯度平方法、预优的共轭梯度法与TFQMR法。共轭梯度法广泛应用于无约束与不等式约束的线性最优化问题，拥有优良的收敛与计算效率。
+
+共轭梯度算法可用于求解如下形式的线性方程组：
+
+```
+Ax = B
+```
+
+其中，A 是一个 N 阶的方阵、x 为 N\*1 大小的待求解的模型向量，B 为 N\*1 大小的需拟合的目标向量。需要注意的是，不同种类的共轭梯度算法对A可能有不同的要求，比如必须是正定的，或者对称的。不同算法的具体要求可以查阅其他参考文献或者查看代码中的注释。
+
+## 安装
+
+算法库使用 CMake 工具进行构建，可在不同操作平台生成相应的Makefile或工程文件。
+
+### 编译选项
+
+算法库目前可用的编译选项有：
+* LibLCG_OPENMP：是否使用OpenMP进行加速，需要安装OpenMP。默认为ON。
+* LibLCG_EIGEN：是否编译基于Eigen的算法与接口，需要安装Eigen。默认为ON。
+* LibLCG_STD_COMPLEX：是否使用std::complex\<double\>作为复数的默认类型。默认为ON。
+* LibLCG_CUDA：是否编译基于CUDA的算法与接口，需要安装CUDA。默认为ON。
+
+用户可以使用cmake命令中的-D选项对编译选项进行设置，比如关闭LibLCG_EIGEN：
+
+```shell
+cmake -DLibLCG_EIGEN=OFF
+```
+
+### Linux 与 MacOS
+
+liblcg的默认安装路径为 /usr/local。头文件与动态库分别安装于 include 与 lib 文件夹。具体的编译与安装步骤如下：
+
+1. 下载安装CMake软件；
+2. 下载安装GCC编译器（常见系统已内置）；
+3. 在源文件路径内使用如下命令进行编译与安装：
+
+```shell
+mkdir build && cd build && cmake .. && make install
+```
+
+### Windows
+
+#### MinGW 和 GCC
+
+Windows系统不包含GNU编译环境，用户需自行下载并配置。方法如下：
+
+1. 下载MinGW安装文件，并选择gcc、pthreads与make相关软件包安装；
+2. 下载安装CMake软件；
+3. 添加CMake与MinGW可执行文件路径至Windows环境变量；
+4. 在源文件路径内使用如下命令进行编译与安装：
+
+```shell
+mkdir build && cd build && cmake .. -G "MinGW Makefiles" && make install
+```
+
+默认的安装路径为C:/Program\\ Files。头文件与动态库分别安装于 include 与 lib 文件夹。
+
+**注意：用户需要手动添加头文件与动态库地址到计算机的环境变量中。**
+
+#### Visual Studio
+
+用户可使用CMake工具构建VS工程文件并编译使用动态库。方法如下：
+
+1. 下载安装 Visual Studio 软件；
+2. 下载安装CMake软件；
+3. 在源文件路径内使用如下命令生成VS工程文件：
+
+```shell
+mkdir build && cd build && cmake .. -G "Visual Studio 16 2019"
+```
+
+_注：如需生成其他版本的VS工程文件，请使用-G命令查看相应的识别码。_
+
+4. 使用 Visual Studio 打开.sln工程文件并编译动态库。
+
+## 使用与编译
+
+用户使用库函数时需在源文件中引入相应的头文件，如：
+
+```cpp
+#include "lcg/lcg.h"
+```
+
+编译可执行文件时需链接lcg动态库。以g++为例：
+
+```shell
+g++ example.cpp -llcg -o example_out
+```
+
+## 快速开始
+
+要使用liblcg求解线性方程组Ax=B，用户需要定义Ax乘积的计算函数（回调函数），该函数的功能为计算不同的x所对应的乘积Ax。以实数类型的共轭梯度算法为例，其回调函数的接口定义为：
+
+```cpp
+typedef void (*lcg_axfunc_ptr)(void* instance, const lcg_float* x, lcg_float* prod_Ax, const int n_size);
+```
+
+其中，`x`为输入的向量，`prod_Ax`为返回的乘积向量，`n_size`为这两个向量的长度。注意此处参数列表中并不包含矩阵A，这意味着A必须为全局或者类变量。这样设计的主要原因是在某些复杂最优化问题的编程中，计算并存储A并不实际或者划算，此时一般采用的策略是存储相关变量且仅计算Ax的乘积，所以矩阵A并不总是存在。
+
+用户在定义Ax计算函数后即可调用求解函数 lcg_solver() 对线性方程组进行求解。以无约束的求解函数为例，其声明如下：
+
+```cpp
+int lcg_solver(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
+	const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_CGS);
+```
+
+其中：
+1. `lcg_axfunc_ptr Afp` 为正演计算的回调函数；
+2. `lcg_progress_ptr Pfp` 监控迭代过程的回调函数（非必须，无需监控时使用 nullptr 参数即可）；
+3. `lcg_float* m` 初始解向量，迭代取得的解也保存于此数组；
+4. `const lcg_float* B` Ax = B 中的 B 项；
+5. `const int n_size` 解向量的大小；
+6. `const lcg_para* param` 迭代使用的参数，此参数为 nullptr 即使用默认参数；
+7. `void* instance` 传入的实例对象, 此函数在类中使用即为类的 this 指针, 在普通函数中使用时即为 nullptr；
+8. `lcg_solver_enum solver_id` 求解函数使用的求解方法，默认为 LCG_CGS，具体的方法代号可查看对应的头文件。
+
+### 一个简单的例子
+
+```cpp
+#include "cmath"
+#include "iostream"
+#include "lcg/lcg.h"
+
+#define M 100
+#define N 80
+
+// 返回两个数组元素之间的最大差值
+lcg_float max_diff(const lcg_float *a, const lcg_float *b, int size)
+{
+	lcg_float max = -1;
+	for (int i = 0; i < size; i++)
+	{
+		max = lcg_max(sqrt((a[i] - b[i])*(a[i] - b[i])), max);
+	}
+	return max;
+}
+
+// 普通二维数组做核矩阵
+lcg_float **kernel;
+// 中间结果数组
+lcg_float *tmp_arr;
+
+// 计算核矩阵乘向量的乘积 lcg_solver的回调函数
+void CalAx(void* instance, const lcg_float* x, lcg_float* prod_Ax, const int n_s)
+{
+    // 注意核矩阵实际为 kernel^T * kernel，大小为N*N
+	lcg_matvec(kernel, x, tmp_arr, M, n_s, MatNormal); // tmp_arr = kernel * x
+	lcg_matvec(kernel, tmp_arr, prod_Ax, M, n_s, MatTranspose); // prod_Ax = kernel^T * tmp_arr
+	return;
+}
+
+// 定义监控函数 lcg_solver的回调函数
+// 这个函数显示当前的迭代次数与收敛值
+int Prog(void* instance, const lcg_float* m, const lcg_float converge, const lcg_para* param, const int n_s, const int k)
+{
+	std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
+	return 0;
+}
+
+int main(int argc, char const *argv[])
+{
+    // 开辟数组空间
+	kernel = lcg_malloc(M, N);
+	tmp_arr = lcg_malloc(M);
+
+    // 为核矩阵赋初值
+	lcg_vecrnd(kernel, -1.0, 1.0, M, N);
+
+	// 生成一组理论解
+	lcg_float *fm = lcg_malloc(N);
+	lcg_vecrnd(fm, 1.0, 2.0, N);
+
+	// 计算共轭梯度B项
+	lcg_float *B = lcg_malloc(N);
+	lcg_matvec(kernel, fm, tmp_arr, M, N, MatNormal);
+	lcg_matvec(kernel, tmp_arr, B, M, N, MatTranspose);
+
+	// 设置共轭梯度参数
+	lcg_para self_para = lcg_default_parameters();
+	self_para.epsilon = 1e-5;
+	self_para.abs_diff = 0;
+
+	// 声明一组解
+	lcg_float *m = lcg_malloc(N);
+	lcg_vecset(m, 0.0, N);
+
+	// 使用标准共轭梯度方法（LCG_CG）求解线性方程组
+    // 将回调函数传递给solver
+    // 由于回调函数为全局函数，因此instance变量的值为NULL
+	int ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_CG);
+	std::clog << std::endl; lcg_error_str(ret);
+	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+
+    // 销毁数组
+	lcg_free(kernel, M);
+	lcg_free(tmp_arr);
+	lcg_free(fm);
+	lcg_free(B);
+	lcg_free(m);
+	return 0;
+}
+```
+
+**完整的例子储存在[sample](src/sample)文件夹内。**
+
+## 类模版
+
+liblcg为不同类型的共轭梯度算法定义了通用的求解类模版，包含了类中函数的指针代理及通用的监控函数实现，用户可直接继承并使用。需要注意的是这些类模版中定义了纯虚的函数接口，用户需要全部实现。其中没用到的定义成空函数就行了。以实数的求解类模版为例，需要实现的接口函数包括：
+
+```cpp
+void AxProduct(const lcg_float* a, lcg_float* b, const int num) = 0
+void MxProduct(const lcg_float* a, lcg_float* b, const int num) = 0
+```
+
+其中`AxProduct`是Ax的计算函数，`MxProduct`是预优过程的计算函数，即M^-1x。
--- a/config.h.in
+++ b/config.h.in
@@ -0,0 +1,4 @@
+#cmakedefine LibLCG_OPENMP
+#cmakedefine LibLCG_EIGEN
+#cmakedefine LibLCG_STD_COMPLEX
+#cmakedefine LibLCG_CUDA
--- a/data/README
+++ b/data/README
@@ -0,0 +1,11 @@
+case_*_A: Full symmetric matrix
+
+[ N (int) | nz (int) ]
+[ RowIdx (int) | ColIdx (int) | Val (double) ] * nz
+[ b (double) * N ]
+[ d (double) * N ] (complex matrix only)
+
+case_*_B: Vector
+
+[ N (int) ]
+[ x (double) * N]
--- a/data/cases.7z
+++ b/data/cases.7z
--- a/data/get_cdat.cpp
+++ b/data/get_cdat.cpp
@@ -0,0 +1,105 @@
+#include "../src/lib/lcg_complex.h"
+#include "cstdlib"
+#include "ctime"
+#include "fstream"
+#include "iostream"
+#include "vector"
+
+#include "Eigen/Sparse"
+
+#define random(x) (rand()%x)
+
+typedef Eigen::SparseMatrix<lcg_complex, Eigen::RowMajor> spmat_cd; // 注意Eigen默认的稀疏矩阵排序为列优先
+typedef Eigen::Triplet<lcg_complex> triplt_cd;
+
+// Generates a random symmetric sparse complex test system A*x = b and writes it
+// to two binary files in the current working directory:
+//   case_1M_cA : N, nz, nz records of (row, col, value), then the N entries of b
+//   case_1M_cB : N, then the N entries of the reference solution x
+// Command-line arguments are ignored.
+// Returns 0 on success, 1 if an output file cannot be opened or a write fails.
+int main(int argc, char const *argv[])
+{
+    int N = 1000000;  // order of the square matrix
+    int nz = 1013000; // random values drawn: N diagonal entries plus (nz - N) off-diagonal pairs
+
+    // Open both outputs before doing any work so that an unwritable target is
+    // reported immediately instead of after the whole system has been generated.
+    std::ofstream Aout, Bout;
+    Aout.open("case_1M_cA", std::ios::binary);
+    Bout.open("case_1M_cB", std::ios::binary);
+    if (!Aout.is_open() || !Bout.is_open())
+    {
+        std::cerr << "Failed to open case_1M_cA or case_1M_cB for writing" << std::endl;
+        return 1;
+    }
+
+    lcg_complex *v = new lcg_complex[nz];
+    lcg_complex *x = new lcg_complex[N];
+    lcg_complex *b = new lcg_complex[N];
+
+    lcg_complex one(1.0, 1.0), zero(0.0, 0.0);
+
+    // v holds the matrix values, x the reference solution, b starts at zero.
+    // NOTE(review): clcg_vecrnd presumably fills the array with random values
+    // between the two bounds - confirm against lcg_complex.h.
+    clcg_vecrnd(v, 1.0*one, 10.0*one, nz);
+    clcg_vecrnd(x, 1.0*one, 2.0*one, N);
+    clcg_vecset(b, zero, N);
+
+    std::vector<triplt_cd> val_triplt;
+    val_triplt.reserve(2*(nz-N) + N);
+
+    // Diagonal entries: the first N values of v
+    for (int i = 0; i < N; i++)
+    {
+        val_triplt.push_back(triplt_cd(i, i, v[i]));
+        b[i] += v[i]*x[i];
+    }
+
+    srand((int)time(0));
+
+    // Off-diagonal entries: each remaining value is inserted at (r, c) and (c, r)
+    // so the matrix stays symmetric; b accumulates A*x as the entries are added.
+    // NOTE(review): random(N) is rand() % N; where RAND_MAX is smaller than N
+    // only indices up to RAND_MAX can be produced - confirm this is acceptable.
+    int r, c;
+    int j = N;
+    while (j < nz)
+    {
+        r = random(N);
+        c = random(N);
+        if (r != c)
+        {
+            val_triplt.push_back(triplt_cd(r, c, v[j]));
+            val_triplt.push_back(triplt_cd(c, r, v[j]));
+
+            b[r] += v[j]*x[c];
+            b[c] += v[j]*x[r];
+            j++;
+        }
+    }
+
+    spmat_cd A;
+    A.resize(N, N);
+    A.setZero();
+    
+    A.setFromTriplets(val_triplt.begin(), val_triplt.end());
+
+    lcg_complex tmp;
+    
+    // From here on nz is the number of entries actually stored in A
+    nz = A.nonZeros();
+
+    Aout.write((char*)&N, sizeof(int));
+    Aout.write((char*)&nz, sizeof(int));
+    for (int i = 0; i < N; i++)
+    {
+        // A is row-major, so this visits the stored entries of row i
+        for (spmat_cd::InnerIterator it(A, i); it; ++it)
+        {
+            r = it.row();
+            c = it.col();
+            tmp = it.value();
+
+            Aout.write((char*)&r, sizeof(int));
+            Aout.write((char*)&c, sizeof(int));
+            Aout.write((char*)&tmp, sizeof(lcg_complex));
+        }
+    }
+
+    // The right-hand side b is appended to the matrix file
+    for (int i = 0; i < N; i++)
+    {
+        tmp = b[i];
+        Aout.write((char*)&tmp, sizeof(lcg_complex));
+    }
+
+    // The reference solution x goes to the second file
+    Bout.write((char*)&N, sizeof(int));
+    for (int i = 0; i < N; i++)
+    {
+        tmp = x[i];
+        Bout.write((char*)&tmp, sizeof(lcg_complex));
+    }
+
+    // Check the stream state before closing so a failed write (e.g. disk full)
+    // is reported instead of leaving truncated files behind silently.
+    const bool write_ok = Aout.good() && Bout.good();
+    Aout.close();
+    Bout.close();
+    if (!write_ok)
+    {
+        std::cerr << "Failed to write case_1M_cA or case_1M_cB" << std::endl;
+    }
+
+    delete[] v;
+    delete[] x;
+    delete[] b;
+    return write_ok ? 0 : 1;
+}
--- a/doxy/doxygen.sty
+++ b/doxy/doxygen.sty
@@ -0,0 +1,577 @@
+% stylesheet for doxygen 1.8.17
+\NeedsTeXFormat{LaTeX2e}
+\ProvidesPackage{doxygen}
+
+% Packages used by this style file
+\RequirePackage{alltt}
+%%\RequirePackage{array} %% moved to refman.tex due to workaround for LaTex 2019 version and unmaintained tabu package
+\RequirePackage{calc}
+\RequirePackage{float}
+%%\RequirePackage{ifthen} %% moved to refman.tex due to workaround for LaTex 2019 version and unmaintained tabu package
+\RequirePackage{verbatim}
+\RequirePackage[table]{xcolor}
+\RequirePackage{longtable_doxygen}
+\RequirePackage{tabu_doxygen}
+\RequirePackage{fancyvrb}
+\RequirePackage{tabularx}
+\RequirePackage{multirow}
+\RequirePackage{hanging}
+\RequirePackage{ifpdf}
+\RequirePackage{adjustbox}
+\RequirePackage{amssymb}
+\RequirePackage{stackengine}
+\RequirePackage[normalem]{ulem} % for strikeout, but don't modify emphasis
+
+%---------- Internal commands used in this style file ----------------
+
+\newcommand{\ensurespace}[1]{%
+  \begingroup%
+    \setlength{\dimen@}{#1}%
+    \vskip\z@\@plus\dimen@%
+    \penalty -100\vskip\z@\@plus -\dimen@%
+    \vskip\dimen@%
+    \penalty 9999%
+    \vskip -\dimen@%
+    \vskip\z@skip% hide the previous |\vskip| from |\addvspace|
+  \endgroup%
+}
+
+\newcommand{\DoxyHorRuler}[1]{%
+  \setlength{\parskip}{0ex plus 0ex minus 0ex}%
+  \ifthenelse{#1=0}%
+  {%
+    \hrule%
+  }%
+  {%
+    \hrulefilll%
+  }%
+}
+\newcommand{\DoxyLabelFont}{}
+\newcommand{\entrylabel}[1]{%
+  {%
+    \parbox[b]{\labelwidth-4pt}{%
+      \makebox[0pt][l]{\DoxyLabelFont#1}%
+      \vspace{1.5\baselineskip}%
+    }%
+  }%
+}
+
+\newenvironment{DoxyDesc}[1]{%
+  \ensurespace{4\baselineskip}%
+  \begin{list}{}{%
+    \settowidth{\labelwidth}{20pt}%
+    %\setlength{\parsep}{0pt}%
+    \setlength{\itemsep}{0pt}%
+    \setlength{\leftmargin}{\labelwidth+\labelsep}%
+    \renewcommand{\makelabel}{\entrylabel}%
+  }%
+  \item[#1]%
+}{%
+  \end{list}%
+}
+
+\newsavebox{\xrefbox}
+\newlength{\xreflength}
+\newcommand{\xreflabel}[1]{%
+  \sbox{\xrefbox}{#1}%
+  \setlength{\xreflength}{\wd\xrefbox}%
+  \ifthenelse{\xreflength>\labelwidth}{%
+    \begin{minipage}{\textwidth}%
+      \setlength{\parindent}{0pt}%
+      \hangindent=15pt\bfseries #1\vspace{1.2\itemsep}%
+    \end{minipage}%
+  }{%
+   \parbox[b]{\labelwidth}{\makebox[0pt][l]{\textbf{#1}}}%
+  }%
+}
+
+%---------- Commands used by doxygen LaTeX output generator ----------
+
+% Used by <pre> ... </pre>
+\newenvironment{DoxyPre}{%
+  \small%
+  \begin{alltt}%
+}{%
+  \end{alltt}%
+  \normalsize%
+}
+% Necessary for redefining not defined characters, i.e. "Replacement Character" in tex output.
+\newlength{\CodeWidthChar}
+\newlength{\CodeHeightChar}
+\settowidth{\CodeWidthChar}{?}
+\settoheight{\CodeHeightChar}{?}
+% Necessary for hanging indent
+\newlength{\DoxyCodeWidth}
+
+\newcommand\DoxyCodeLine[1]{\hangpara{\DoxyCodeWidth}{1}{#1}\par}
+
+\newcommand\NiceSpace{%
+     \discretionary{}{\kern\fontdimen2\font}{\kern\fontdimen2\font}%
+}
+
+% Used by @code ... @endcode
+\newenvironment{DoxyCode}[1]{%
+  \par%
+  \scriptsize%
+  \normalfont\ttfamily%
+  \rightskip0pt plus 1fil%
+  \settowidth{\DoxyCodeWidth}{000000}%
+  \settowidth{\CodeWidthChar}{?}%
+  \settoheight{\CodeHeightChar}{?}%
+  \setlength{\parskip}{0ex plus 0ex minus 0ex}%
+  \ifthenelse{\equal{#1}{0}}
+  {
+    {\lccode`~32 \lowercase{\global\let~}\NiceSpace}\obeyspaces%
+  }
+  {
+    {\lccode`~32 \lowercase{\global\let~}}\obeyspaces%
+  }
+
+}{%
+  \normalfont%
+  \normalsize%
+  \settowidth{\CodeWidthChar}{?}%
+  \settoheight{\CodeHeightChar}{?}%
+}
+
+% Redefining not defined characters, i.e. "Replacement Character" in tex output.
+\def\ucr{\adjustbox{width=\CodeWidthChar,height=\CodeHeightChar}{\stackinset{c}{}{c}{-.2pt}{%
+   \textcolor{white}{\sffamily\bfseries\small ?}}{%
+   \rotatebox{45}{$\blacksquare$}}}}
+
+% Used by @example, @include, @includelineno and @dontinclude
+\newenvironment{DoxyCodeInclude}[1]{%
+	\DoxyCode{#1}%
+}{%
+  \endDoxyCode%
+}
+
+% Used by @verbatim ... @endverbatim
+\newenvironment{DoxyVerb}{%
+  \footnotesize%
+  \verbatim%
+}{%
+  \endverbatim%
+  \normalsize%
+}
+
+% Used by @verbinclude
+\newenvironment{DoxyVerbInclude}{%
+  \DoxyVerb%
+}{%
+  \endDoxyVerb%
+}
+
+% Used by numbered lists (using '-#' or <ol> ... </ol>)
+\newenvironment{DoxyEnumerate}{%
+  \enumerate%
+}{%
+  \endenumerate%
+}
+
+% Used by bullet lists (using '-', @li, @arg, or <ul> ... </ul>)
+\newenvironment{DoxyItemize}{%
+  \itemize%
+}{%
+  \enditemize%
+}
+
+% Used by description lists (using <dl> ... </dl>)
+\newenvironment{DoxyDescription}{%
+  \description%
+}{%
+  \enddescription%
+}
+
+% Used by @image, @dotfile, @dot ... @enddot, and @msc ... @endmsc
+% (only if caption is specified)
+\newenvironment{DoxyImage}{%
+  \begin{figure}[H]%
+    \begin{center}%
+}{%
+    \end{center}%
+  \end{figure}%
+}
+
+% Used by @image, @dotfile, @dot ... @enddot, and @msc ... @endmsc
+% (only if no caption is specified)
+\newenvironment{DoxyImageNoCaption}{%
+  \begin{center}%
+}{%
+  \end{center}%
+}
+
+% Used by @image
+% (only if inline is specified)
+\newenvironment{DoxyInlineImage}{%
+}{%
+}
+
+% Used by @attention
+\newenvironment{DoxyAttention}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @author and @authors
+\newenvironment{DoxyAuthor}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @date
+\newenvironment{DoxyDate}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @invariant
+\newenvironment{DoxyInvariant}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @note
+\newenvironment{DoxyNote}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @post
+\newenvironment{DoxyPostcond}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @pre
+\newenvironment{DoxyPrecond}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @copyright
+\newenvironment{DoxyCopyright}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @remark
+\newenvironment{DoxyRemark}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @return and @returns
+\newenvironment{DoxyReturn}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @since
+\newenvironment{DoxySince}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @see
+\newenvironment{DoxySeeAlso}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @version
+\newenvironment{DoxyVersion}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @warning
+\newenvironment{DoxyWarning}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by @internal
+\newenvironment{DoxyInternal}[1]{%
+  \paragraph*{#1}%
+}{%
+}
+
+% Used by @par and @paragraph
+\newenvironment{DoxyParagraph}[1]{%
+  \begin{DoxyDesc}{#1}%
+}{%
+  \end{DoxyDesc}%
+}
+
+% Used by parameter lists
+\newenvironment{DoxyParams}[2][]{%
+    \tabulinesep=1mm%
+    \par%
+    \ifthenelse{\equal{#1}{}}%
+      {\begin{longtabu*}spread 0pt [l]{|X[-1,l]|X[-1,l]|}}% name + description
+    {\ifthenelse{\equal{#1}{1}}%
+      {\begin{longtabu*}spread 0pt [l]{|X[-1,l]|X[-1,l]|X[-1,l]|}}% in/out + name + desc
+      {\begin{longtabu*}spread 0pt [l]{|X[-1,l]|X[-1,l]|X[-1,l]|X[-1,l]|}}% in/out + type + name + desc
+    }
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #2}\\[1ex]%
+    \hline%
+    \endfirsthead%
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #2}\\[1ex]%
+    \hline%
+    \endhead%
+}{%
+    \end{longtabu*}%
+    \vspace{6pt}%
+}
+
+% Used for fields of simple structs
+\newenvironment{DoxyFields}[1]{%
+    \tabulinesep=1mm%
+    \par%
+    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|X[-1,l]|}%
+    \multicolumn{3}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endfirsthead%
+    \multicolumn{3}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endhead%
+}{%
+    \end{longtabu*}%
+    \vspace{6pt}%
+}
+
+% Used for fields simple class style enums
+\newenvironment{DoxyEnumFields}[1]{%
+    \tabulinesep=1mm%
+    \par%
+    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|}%
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endfirsthead%
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endhead%
+}{%
+    \end{longtabu*}%
+    \vspace{6pt}%
+}
+
+% Used for parameters within a detailed function description
+\newenvironment{DoxyParamCaption}{%
+  \renewcommand{\item}[2][]{\\ \hspace*{2.0cm} ##1 {\em ##2}}% 
+}{%
+}
+
+% Used by return value lists
+\newenvironment{DoxyRetVals}[1]{%
+    \tabulinesep=1mm%
+    \par%
+    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|}%
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endfirsthead%
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endhead%
+}{%
+    \end{longtabu*}%
+    \vspace{6pt}%
+}
+
+% Used by exception lists
+\newenvironment{DoxyExceptions}[1]{%
+    \tabulinesep=1mm%
+    \par%
+    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|}%
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endfirsthead%
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endhead%
+}{%
+    \end{longtabu*}%
+    \vspace{6pt}%
+}
+
+% Used by template parameter lists
+\newenvironment{DoxyTemplParams}[1]{%
+    \tabulinesep=1mm%
+    \par%
+    \begin{longtabu*}spread 0pt [l]{|X[-1,r]|X[-1,l]|}%
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endfirsthead%
+    \multicolumn{2}{l}{\hspace{-6pt}\bfseries\fontseries{bc}\selectfont\color{darkgray} #1}\\[1ex]%
+    \hline%
+    \endhead%
+}{%
+    \end{longtabu*}%
+    \vspace{6pt}%
+}
+
+% Used for member lists
+\newenvironment{DoxyCompactItemize}{%
+  \begin{itemize}%
+    \setlength{\itemsep}{-3pt}%
+    \setlength{\parsep}{0pt}%
+    \setlength{\topsep}{0pt}%
+    \setlength{\partopsep}{0pt}%
+}{%
+  \end{itemize}%
+}
+
+% Used for member descriptions
+\newenvironment{DoxyCompactList}{%
+  \begin{list}{}{%
+    \setlength{\leftmargin}{0.5cm}%
+    \setlength{\itemsep}{0pt}%
+    \setlength{\parsep}{0pt}%
+    \setlength{\topsep}{0pt}%
+    \renewcommand{\makelabel}{\hfill}%
+  }%
+}{%
+  \end{list}%
+}
+
+% Used for reference lists (@bug, @deprecated, @todo, etc.)
+\newenvironment{DoxyRefList}{%
+  \begin{list}{}{%
+    \setlength{\labelwidth}{10pt}%
+    \setlength{\leftmargin}{\labelwidth}%
+    \addtolength{\leftmargin}{\labelsep}%
+    \renewcommand{\makelabel}{\xreflabel}%
+  }%
+}{%
+  \end{list}%
+}
+
+% Used by @bug, @deprecated, @todo, etc.
+\newenvironment{DoxyRefDesc}[1]{%
+  \begin{list}{}{%
+    \renewcommand\makelabel[1]{\textbf{##1}}%
+    \settowidth\labelwidth{\makelabel{#1}}%
+    \setlength\leftmargin{\labelwidth+\labelsep}%
+  }%
+}{%
+  \end{list}%
+}
+
+% Used by parameter lists and simple sections
+\newenvironment{Desc}
+{\begin{list}{}{%
+    \settowidth{\labelwidth}{20pt}%
+    \setlength{\parsep}{0pt}%
+    \setlength{\itemsep}{0pt}%
+    \setlength{\leftmargin}{\labelwidth+\labelsep}%
+    \renewcommand{\makelabel}{\entrylabel}%
+  }
+}{%
+  \end{list}%
+}
+
+% Used by tables
+\newcommand{\PBS}[1]{\let\temp=\\#1\let\\=\temp}%
+\newenvironment{TabularC}[1]%
+{\tabulinesep=1mm
+\begin{longtabu*}spread 0pt [c]{*#1{|X[-1]}|}}%
+{\end{longtabu*}\par}%
+
+\newenvironment{TabularNC}[1]%
+{\begin{tabu}spread 0pt [l]{*#1{|X[-1]}|}}%
+{\end{tabu}\par}%
+
+% Used for member group headers
+\newenvironment{Indent}{%
+  \begin{list}{}{%
+    \setlength{\leftmargin}{0.5cm}%
+  }%
+  \item[]\ignorespaces%
+}{%
+  \unskip%
+  \end{list}%
+}
+
+% Used when hyperlinks are turned off
+\newcommand{\doxyref}[3]{%
+  \textbf{#1} (\textnormal{#2}\,\pageref{#3})%
+}
+
+% Used to link to a table when hyperlinks are turned on
+\newcommand{\doxytablelink}[2]{%
+  \ref{#1}%
+}
+
+% Used to link to a table when hyperlinks are turned off
+\newcommand{\doxytableref}[3]{%
+  \ref{#3}%
+}
+
+% Used by @addindex
+\newcommand{\lcurly}{\{}
+\newcommand{\rcurly}{\}}
+
+% Colors used for syntax highlighting
+\definecolor{comment}{rgb}{0.5,0.0,0.0}
+\definecolor{keyword}{rgb}{0.0,0.5,0.0}
+\definecolor{keywordtype}{rgb}{0.38,0.25,0.125}
+\definecolor{keywordflow}{rgb}{0.88,0.5,0.0}
+\definecolor{preprocessor}{rgb}{0.5,0.38,0.125}
+\definecolor{stringliteral}{rgb}{0.0,0.125,0.25}
+\definecolor{charliteral}{rgb}{0.0,0.5,0.5}
+\definecolor{vhdldigit}{rgb}{1.0,0.0,1.0}
+\definecolor{vhdlkeyword}{rgb}{0.43,0.0,0.43}
+\definecolor{vhdllogic}{rgb}{1.0,0.0,0.0}
+\definecolor{vhdlchar}{rgb}{0.0,0.0,0.0}
+
+% Color used for table heading
+\newcommand{\tableheadbgcolor}{lightgray}%
+
+% Version of hypertarget with correct landing location
+\newcommand{\Hypertarget}[1]{\Hy@raisedlink{\hypertarget{#1}{}}}
+
+% possibility to have sections etc. be within the margins
+% unfortunately had to copy part of book.cls and add \raggedright
+\makeatletter
+\newcommand\doxysection{\@startsection {section}{1}{\z@}%
+                                   {-3.5ex \@plus -1ex \@minus -.2ex}%
+                                   {2.3ex \@plus.2ex}%
+                                   {\raggedright\normalfont\Large\bfseries}}
+\newcommand\doxysubsection{\@startsection{subsection}{2}{\z@}%
+                                     {-3.25ex\@plus -1ex \@minus -.2ex}%
+                                     {1.5ex \@plus .2ex}%
+                                     {\raggedright\normalfont\large\bfseries}}
+\newcommand\doxysubsubsection{\@startsection{subsubsection}{3}{\z@}%
+                                     {-3.25ex\@plus -1ex \@minus -.2ex}%
+                                     {1.5ex \@plus .2ex}%
+                                     {\raggedright\normalfont\normalsize\bfseries}}
+\newcommand\doxyparagraph{\@startsection{paragraph}{4}{\z@}%
+                                    {3.25ex \@plus1ex \@minus.2ex}%
+                                    {-1em}%
+                                    {\raggedright\normalfont\normalsize\bfseries}}
+\newcommand\doxysubparagraph{\@startsection{subparagraph}{5}{\parindent}%
+                                       {3.25ex \@plus1ex \@minus .2ex}%
+                                       {-1em}%
+                                      {\raggedright\normalfont\normalsize\bfseries}}
+\makeatother
+% Define caption that is also suitable in a table
+\makeatletter
+\def\doxyfigcaption{%
+\refstepcounter{figure}%
+\@dblarg{\@caption{figure}}}
+\makeatother
--- a/doxy/footer.tex
+++ b/doxy/footer.tex
@@ -0,0 +1,12 @@
+% Latex footer for doxygen 1.8.17
+%--- End generated contents ---
+
+% Index
+\backmatter
+\newpage
+\phantomsection
+\clearemptydoublepage
+\addcontentsline{toc}{chapter}{\indexname}
+\printindex
+
+\end{document}
--- a/doxy/header.tex
+++ b/doxy/header.tex
@@ -0,0 +1,174 @@
+% Latex header for doxygen 1.8.17
+\let\mypdfximage\pdfximage\def\pdfximage{\immediate\mypdfximage}\documentclass[twoside]{book}
+
+%% moved from doxygen.sty due to workaround for LaTex 2019 version and unmaintained tabu package
+\usepackage{ifthen}
+\ifx\requestedLaTeXdate\undefined
+\usepackage{array}
+\else
+\usepackage{array}[=2016-10-06]
+\fi
+%%
+% Packages required by doxygen
+\usepackage{fixltx2e}
+\usepackage{calc}
+\usepackage{doxygen}
+\usepackage{graphicx}
+\usepackage[utf8]{inputenc}
+\usepackage{makeidx}
+\usepackage{multicol}
+\usepackage{multirow}
+\PassOptionsToPackage{warn}{textcomp}
+\usepackage{textcomp}
+\usepackage[nointegrals]{wasysym}
+\usepackage[table]{xcolor}
+\usepackage{ifpdf,ifxetex}
+
+% Font selection
+\usepackage[T1]{fontenc}
+\usepackage[scaled=.90]{helvet}
+\usepackage{courier}
+\usepackage{amssymb}
+\usepackage{sectsty}
+\renewcommand{\familydefault}{\sfdefault}
+\allsectionsfont{%
+  \fontseries{bc}\selectfont%
+  \color{darkgray}%
+}
+\renewcommand{\DoxyLabelFont}{%
+  \fontseries{bc}\selectfont%
+  \color{darkgray}%
+}
+\newcommand{\+}{\discretionary{\mbox{\scriptsize$\hookleftarrow$}}{}{}}
+
+% Arguments of doxygenemoji:
+% 1) ':<text>:' form of the emoji, already "LaTeX"-escaped
+% 2) file with the name of the emoji without the .png extension
+% in case image exist use this otherwise use the ':<text>:' form
+\newcommand{\doxygenemoji}[2]{%
+  \IfFileExists{./#2.png}{\raisebox{-0.1em}{\includegraphics[height=0.9em]{./#2.png}}}{#1}%
+}
+% Page & text layout
+\usepackage{geometry}
+\geometry{%
+  a4paper,%
+  top=2.5cm,%
+  bottom=2.5cm,%
+  left=2.5cm,%
+  right=2.5cm%
+}
+\tolerance=750
+\hfuzz=15pt
+\hbadness=750
+\setlength{\emergencystretch}{15pt}
+\setlength{\parindent}{0cm}
+\newcommand{\doxynormalparskip}{\setlength{\parskip}{3ex plus 2ex minus 2ex}}
+\newcommand{\doxytocparskip}{\setlength{\parskip}{1ex plus 0ex minus 0ex}}
+\doxynormalparskip
+\makeatletter
+\renewcommand{\paragraph}{%
+  \@startsection{paragraph}{4}{0ex}{-1.0ex}{1.0ex}{%
+    \normalfont\normalsize\bfseries\SS@parafont%
+  }%
+}
+\renewcommand{\subparagraph}{%
+  \@startsection{subparagraph}{5}{0ex}{-1.0ex}{1.0ex}{%
+    \normalfont\normalsize\bfseries\SS@subparafont%
+  }%
+}
+\makeatother
+
+\makeatletter
+\newcommand\hrulefilll{\leavevmode\leaders\hrule\hskip 0pt plus 1filll\kern\z@}
+\makeatother
+
+% Headers & footers
+\usepackage{fancyhdr}
+\pagestyle{fancyplain}
+\fancyhead[LE]{\fancyplain{}{\bfseries\thepage}}
+\fancyhead[CE]{\fancyplain{}{}}
+\fancyhead[RE]{\fancyplain{}{\bfseries\leftmark}}
+\fancyhead[LO]{\fancyplain{}{\bfseries\rightmark}}
+\fancyhead[CO]{\fancyplain{}{}}
+\fancyhead[RO]{\fancyplain{}{\bfseries\thepage}}
+\fancyfoot[LE]{\fancyplain{}{}}
+\fancyfoot[CE]{\fancyplain{}{}}
+\fancyfoot[RE]{\fancyplain{}{\bfseries\scriptsize Generated by Doxygen }}
+\fancyfoot[LO]{\fancyplain{}{\bfseries\scriptsize Generated by Doxygen }}
+\fancyfoot[CO]{\fancyplain{}{}}
+\fancyfoot[RO]{\fancyplain{}{}}
+\renewcommand{\footrulewidth}{0.4pt}
+\renewcommand{\chaptermark}[1]{%
+  \markboth{#1}{}%
+}
+\renewcommand{\sectionmark}[1]{%
+  \markright{\thesection\ #1}%
+}
+
+% Indices & bibliography
+\usepackage{natbib}
+\usepackage[titles]{tocloft}
+\setcounter{tocdepth}{3}
+\setcounter{secnumdepth}{5}
+\makeindex
+
+\usepackage{newunicodechar}
+  \newunicodechar{⁻}{${}^{-}$}% Superscript minus
+  \newunicodechar{²}{${}^{2}$}% Superscript two
+  \newunicodechar{³}{${}^{3}$}% Superscript three
+
+% Hyperlinks (required, but should be loaded last)
+\ifpdf
+  \usepackage[pdftex,pagebackref=true]{hyperref}
+\else
+  \ifxetex
+    \usepackage[pagebackref=true]{hyperref}
+  \else
+    \usepackage[ps2pdf,pagebackref=true]{hyperref}
+  \fi
+\fi
+
+\hypersetup{%
+  colorlinks=true,%
+  linkcolor=blue,%
+  citecolor=blue,%
+  unicode%
+}
+
+% Custom commands
+\newcommand{\clearemptydoublepage}{%
+  \newpage{\pagestyle{empty}\cleardoublepage}%
+}
+
+\usepackage{caption}
+\captionsetup{labelsep=space,justification=centering,font={bf},singlelinecheck=off,skip=4pt,position=top}
+
+\usepackage{etoc}
+\etocsettocstyle{\doxytocparskip}{\doxynormalparskip}
+\renewcommand{\numberline}[1]{#1~}
+%===== C O N T E N T S =====
+
+\begin{document}
+
+% Titlepage & ToC
+\hypersetup{pageanchor=false,
+             bookmarksnumbered=true,
+             pdfencoding=unicode
+            }
+\pagenumbering{alph}
+\begin{titlepage}
+\vspace*{7cm}
+\begin{center}%
+{\Large C++ Library of the Linear Conjugate Gradient Methods (LibLCG)}\\
+\vspace*{1cm}
+{\large Yi Zhang}\\
+\end{center}
+\end{titlepage}
+\clearemptydoublepage
+\pagenumbering{roman}
+\tableofcontents
+\clearemptydoublepage
+\pagenumbering{arabic}
+\hypersetup{pageanchor=true}
+
+%--- Begin generated contents ---
--- a/refman.pdf
+++ b/refman.pdf
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -0,0 +1,181 @@
+# Collect the library sources from the lib/ folder; the Eigen and CUDA sources are removed again below when the matching option is off
+aux_source_directory(lib LCGLIB_SRC)
+
+if(NOT LibLCG_EIGEN)
+	list(REMOVE_ITEM LCGLIB_SRC "lib/algebra_eigen.cpp")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/lcg_eigen.cpp")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/clcg_eigen.cpp")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/solver_eigen.cpp")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/preconditioner_eigen.cpp")
+endif()
+
+if(NOT LibLCG_CUDA)
+	list(REMOVE_ITEM LCGLIB_SRC "lib/algebra_cuda.cu")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/lcg_complex_cuda.cu")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/lcg_cuda.cu")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/clcg_cuda.cu")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/clcg_cuda_f.cu")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/solver_cuda.cu")
+	list(REMOVE_ITEM LCGLIB_SRC "lib/preconditioner_cuda.cu")
+endif()
+
+# The following part builds the libraries
+# Target names must be unique, so the shared and the static library cannot both be generated under the same target name
+# There is no need to add the lib prefix or a file suffix to the target name here; CMake adds them by itself
+add_library(lcg SHARED ${LCGLIB_SRC})
+# Add the build command of the static library
+add_library(lcg_static STATIC ${LCGLIB_SRC})
+# Set the output name of the static library so that it gets the same name as the shared library
+set_target_properties(lcg_static PROPERTIES OUTPUT_NAME "lcg")
+# Set the target properties so that the shared and the static library are both output
+set_target_properties(lcg PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+set_target_properties(lcg_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+if(LibLCG_CUDA)
+	set_target_properties(lcg PROPERTIES CUDA_ARCHITECTURES 70)
+	set_target_properties(lcg_static PROPERTIES CUDA_ARCHITECTURES 70)
+endif()
+# Set the version number of the shared library
+set_target_properties(lcg PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR})
+# Set the output directory of the library files
+set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
+
+# Set the compile options. NOTE(review): CMAKE_CXX_STANDARD only initializes CXX_STANDARD of targets created after this line, so it presumably does not reach lcg and lcg_static above - confirm the intent
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+
+if(LibLCG_EIGEN)
+	find_package(Eigen3 REQUIRED)
+	if(EIGEN3_FOUND)
+		message(STATUS "Eigen3 Found.")
+		include_directories(${EIGEN3_INCLUDE_DIR})
+	endif()
+endif()
+
+if(LibLCG_CUDA)
+	enable_language(CUDA)
+	find_package(CUDA REQUIRED)
+	if(CUDA_FOUND)
+		message(STATUS "CUDA Found.")
+		include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+		find_library(CUBLAS_LIBRARY cublas ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+		find_library(CUSPARSE_LIBRARY cusparse ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+		find_library(CUSOLVER_LIBRARY cusolver ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+		target_link_libraries(lcg PUBLIC ${CUBLAS_LIBRARY})
+		target_link_libraries(lcg_static ${CUBLAS_LIBRARY})
+		target_link_libraries(lcg PUBLIC ${CUSPARSE_LIBRARY})
+		target_link_libraries(lcg_static ${CUSPARSE_LIBRARY})
+		target_link_libraries(lcg PUBLIC ${CUSOLVER_LIBRARY})
+		target_link_libraries(lcg_static ${CUSOLVER_LIBRARY})
+	endif()
+endif()
+
+if(LibLCG_OPENMP)
+	# Add the OpenMP compile flags to the compiler and linker options and link both libraries against the OpenMP target
+	find_package(OpenMP REQUIRED)
+	if (OpenMP_CXX_FOUND)
+		message(STATUS "OpenMP Found.")
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+		set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
+		target_link_libraries(lcg PUBLIC OpenMP::OpenMP_CXX)
+		target_link_libraries(lcg_static OpenMP::OpenMP_CXX)
+	endif()
+endif()
+
+set(CONFIG_FILE_PATH lib/cmake/${PROJECT_NAME})
+
+configure_package_config_file(${PROJECT_SOURCE_DIR}/${PROJECT_NAME}Config.cmake.in 
+	${CMAKE_BINARY_DIR}/${PROJECT_NAME}Config.cmake
+	INSTALL_DESTINATION ${CONFIG_FILE_PATH}
+	NO_CHECK_REQUIRED_COMPONENTS_MACRO)
+
+write_basic_package_version_file(${CMAKE_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
+        VERSION ${PROJECT_VERSION}
+        COMPATIBILITY SameMajorVersion)
+
+# Install rules of the libraries; the exported targets and the package config files are only installed on non-Windows platforms
+if(WIN32)
+	install(TARGETS lcg DESTINATION lib)
+	install(TARGETS lcg_static DESTINATION lib)
+else()
+	install(TARGETS lcg lcg_static
+		EXPORT ${PROJECT_NAME}Targets
+		LIBRARY DESTINATION lib
+		ARCHIVE DESTINATION lib)
+	install(EXPORT ${PROJECT_NAME}Targets
+	        DESTINATION ${CONFIG_FILE_PATH})
+	install(FILES
+        ${CMAKE_BINARY_DIR}/${PROJECT_NAME}Config.cmake
+        ${CMAKE_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
+        DESTINATION ${CONFIG_FILE_PATH})
+endif()
+# Install rules of the header files; the CUDA and Eigen headers are only installed when the matching option is on
+install(FILES lib/config.h DESTINATION include/lcg)
+install(FILES lib/algebra.h DESTINATION include/lcg)
+install(FILES lib/lcg_complex.h DESTINATION include/lcg)
+install(FILES lib/util.h DESTINATION include/lcg)
+install(FILES lib/lcg.h DESTINATION include/lcg)
+install(FILES lib/clcg.h DESTINATION include/lcg)
+install(FILES lib/solver.h DESTINATION include/lcg)
+install(FILES lib/preconditioner.h DESTINATION include/lcg)
+
+if(LibLCG_CUDA)
+	install(FILES lib/algebra_cuda.h DESTINATION include/lcg)
+	install(FILES lib/lcg_complex_cuda.h DESTINATION include/lcg)
+	install(FILES lib/lcg_cuda.h DESTINATION include/lcg)
+	install(FILES lib/clcg_cuda.h DESTINATION include/lcg)
+	install(FILES lib/clcg_cudaf.h DESTINATION include/lcg)
+	install(FILES lib/solver_cuda.h DESTINATION include/lcg)
+	install(FILES lib/preconditioner_cuda.h DESTINATION include/lcg)
+endif()
+
+if(LibLCG_EIGEN)
+	install(FILES lib/algebra_eigen.h DESTINATION include/lcg)
+	install(FILES lib/lcg_eigen.h DESTINATION include/lcg)
+	install(FILES lib/clcg_eigen.h DESTINATION include/lcg)
+	install(FILES lib/solver_eigen.h DESTINATION include/lcg)
+	install(FILES lib/preconditioner_eigen.h DESTINATION include/lcg)
+endif()
+
+# The following part builds the sample programs
+# Set the output directory of the executables
+set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
+
+# How a sample is built: add_sample(<target name> <source file inside sample/>)
+macro(add_sample name file)
+	# Add the executable (a command-line program)
+	add_executable(${name} sample/${file})
+	# Add the search path of the shared library to the installed file; this is of no use on Windows and is simply ignored there
+	set_target_properties(${name} PROPERTIES INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
+	# Link against the shared library
+	target_link_libraries(${name} PUBLIC lcg)
+	# CUDA settings
+	if(LibLCG_CUDA)
+		set_target_properties(${name} PROPERTIES CUDA_ARCHITECTURES 70)
+	endif()
+endmacro()
+
+add_sample(lcg_sample1 sample1.cpp)
+add_sample(lcg_sample2 sample2.cpp)
+add_sample(lcg_sample3 sample3.cpp)
+add_sample(lcg_sample4 sample4.cpp)
+
+if(LibLCG_EIGEN)
+	add_sample(lcg_sample5 sample5.cpp)
+	add_sample(lcg_sample7 sample7.cpp)
+	if(LibLCG_STD_COMPLEX)
+		add_sample(lcg_sample6 sample6.cpp) 
+	endif()
+endif()
+
+if(LibLCG_CUDA)
+	# The following samples are disabled for now because of CUDA 12+ compatibility issues; to be revisited later
+	#add_sample(lcg_sample8 sample8.cu)
+	#add_sample(lcg_sample9 sample9.cu)
+	#add_sample(lcg_sample10 sample10.cu)
+	#add_sample(lcg_sample11 sample11.cu)
+	#add_sample(lcg_sample12 sample12.cu)
+	#add_sample(lcg_sample13 sample13.cu)
+	#add_sample(lcg_sample14 sample14.cu)
+	add_sample(lcg_sample15 sample15.cu)
+endif()
--- a/src/lib/algebra.cpp
+++ b/src/lib/algebra.cpp
@@ -0,0 +1,222 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "cmath"
+#include "ctime"
+#include "random"
+
+#include "algebra.h"
+
+#ifdef LibLCG_OPENMP
+#include "omp.h"
+#endif
+
+// Absolute value of a: non-negative inputs are handed back untouched,
+// everything else is multiplied by -1.0.
+lcg_float lcg_abs(lcg_float a)
+{
+	return (a >= 0.0) ? a : -1.0*a;
+}
+
+// Larger one of a and b; a wins when the two compare equal.
+lcg_float lcg_max(lcg_float a, lcg_float b)
+{
+	return (a >= b) ? a : b;
+}
+
+// Smaller one of a and b; a wins when the two compare equal.
+lcg_float lcg_min(lcg_float a, lcg_float b)
+{
+	return (a <= b) ? a : b;
+}
+
+// Clamp a into the box spanned by low and hig.
+//
+// low_bound / hig_bound tell whether the respective boundary value itself is
+// admissible. An input at or beyond an excluded boundary is moved 1e-16 to the
+// inside of it. That fixed margin is absorbed by rounding once the boundary
+// reaches a magnitude of about 2 (hig - 1e-16 == hig), in which case the
+// neighbouring representable value in the direction of the opposite boundary
+// is returned instead, so an excluded boundary is never handed back while
+// low < hig. The high side is tested first; inputs strictly inside the box
+// (and NaN, which fails every comparison) are returned unchanged.
+lcg_float lcg_set2box(lcg_float low, lcg_float hig, lcg_float a, 
+	bool low_bound, bool hig_bound)
+{
+	if (a >= hig)
+	{
+		if (hig_bound) return hig;
+		const lcg_float shifted = hig - 1e-16;
+		return (shifted < hig) ? shifted : std::nextafter(hig, low);
+	}
+
+	if (a <= low)
+	{
+		if (low_bound) return low;
+		const lcg_float shifted = low + 1e-16;
+		return (shifted > low) ? shifted : std::nextafter(low, hig);
+	}
+
+	return a;
+}
+
+// Allocate an array of n lcg_float values with new[]. The values are left
+// uninitialized; release the array with lcg_free(lcg_float*).
+lcg_float* lcg_malloc(int n)
+{
+	return new lcg_float [n];
+}
+
+// Allocate m separately allocated rows of n lcg_float values each, reachable
+// through an array of m row pointers. The values are left uninitialized;
+// release the whole structure with lcg_free(lcg_float**, int).
+lcg_float** lcg_malloc(int m, int n)
+{
+	lcg_float **rows = new lcg_float* [m];
+	int r = 0;
+	while (r < m)
+	{
+		rows[r] = new lcg_float [n];
+		++r;
+	}
+	return rows;
+}
+
+// Release an array that was allocated with new[] (see lcg_malloc(int)).
+//
+// x is received by value, so the pointer held by the caller can not be reset
+// from here: it keeps its old value and must not be used (or freed again)
+// after this call. Passing a null pointer is allowed and does nothing, as
+// delete[] on a null pointer is a no-op.
+void lcg_free(lcg_float* x)
+{
+	delete[] x;
+	return;
+}
+
+// Release a 2D array made of m separately allocated rows plus the array of
+// row pointers (see lcg_malloc(int, int)). m must be the row count that was
+// used for the allocation.
+//
+// x is received by value, so the pointer held by the caller can not be reset
+// from here: it keeps its old value and must not be used (or freed again)
+// after this call. Passing a null pointer does nothing.
+void lcg_free(lcg_float **x, int m)
+{
+	if (x == nullptr) return;
+
+	for (int i = 0; i < m; i++)
+	{
+		delete[] x[i];
+	}
+	delete[] x;
+	return;
+}
+
+// Assign b to the first size elements of a. Nothing is written when size is
+// zero or negative.
+void lcg_vecset(lcg_float *a, lcg_float b, int size)
+{
+	int k = 0;
+	while (k < size)
+	{
+		a[k] = b;
+		++k;
+	}
+}
+
+// Assign b to every element of the m rows of a, each row holding n elements.
+void lcg_vecset(lcg_float **a, lcg_float b, int m, int n)
+{
+	for (int r = 0; r < m; r++)
+	{
+		lcg_float *line = a[r];
+		for (int c = 0; c < n; c++) line[c] = b;
+	}
+}
+
+// Fill the first size elements of a with uniformly distributed random values
+// between l and h.
+//
+// The generator is seeded once, on the first call, and keeps its state across
+// calls. Re-seeding from the wall clock on every call, as done before, made
+// all calls within the same second produce identical vectors and also reset
+// the global rand() sequence of the calling program.
+// The draws are scaled as (h-l)*u + l with u taken from [0, 1).
+// The shared generator state is not protected against concurrent callers.
+void lcg_vecrnd(lcg_float *a, lcg_float l, lcg_float h, int size)
+{
+	static std::mt19937_64 engine(std::random_device{}());
+	std::uniform_real_distribution<lcg_float> unit(0.0, 1.0);
+
+	for (int i = 0; i < size; i++)
+	{
+		a[i] = (h-l)*unit(engine) + l;
+	}
+	return;
+}
+
+// Fill the m rows of a, each holding n elements, with uniformly distributed
+// random values between l and h.
+//
+// The generator is seeded once, on the first call, and keeps its state across
+// calls. Re-seeding from the wall clock on every call, as done before, made
+// all calls within the same second produce identical data and also reset the
+// global rand() sequence of the calling program.
+// The draws are scaled as (h-l)*u + l with u taken from [0, 1).
+// The shared generator state is not protected against concurrent callers.
+void lcg_vecrnd(lcg_float **a, lcg_float l, lcg_float h, int m, int n)
+{
+	static std::mt19937_64 engine(std::random_device{}());
+	std::uniform_real_distribution<lcg_float> unit(0.0, 1.0);
+
+	for (int i = 0; i < m; i++)
+	{
+		for (int j = 0; j < n; j++)
+		{
+			a[i][j] = (h-l)*unit(engine) + l;
+		}
+	}
+	return;
+}
+
+// Sum of the squares of the first n elements of a, i.e. the squared L2 norm.
+// Returns 0 when n is zero or negative.
+double lcg_squaredl2norm(lcg_float *a, int n)
+{
+	lcg_float sum = 0;
+	// The index is a signed int like n: an unsigned index would turn a negative
+	// n into a huge loop bound and read far beyond the array.
+	for (int i = 0; i < n; i++)
+	{
+		sum += a[i]*a[i];
+	}
+	return sum;
+}
+
+// Dot product of the first size elements of a and b. The result is
+// accumulated directly in ret, which is reset to zero first.
+void lcg_dot(lcg_float &ret, const lcg_float *a, 
+	const lcg_float *b, int size)
+{
+	ret = 0.0;
+	int k = 0;
+	while (k < size)
+	{
+		ret += a[k]*b[k];
+		++k;
+	}
+}
+
+// Dense matrix-vector product, A being stored as m_size rows of n_size
+// elements.
+//   layout == MatNormal : Ax[i] = sum_j A[i][j]*x[j]  (Ax has m_size entries)
+//   any other layout    : Ax[j] = sum_i A[i][j]*x[i]  (Ax has n_size entries)
+// The outer loop of either product is spread over the OpenMP threads; every
+// output entry is written by one iteration only.
+void lcg_matvec(lcg_float **A, const lcg_float *x, lcg_float *Ax, 
+	int m_size, int n_size, lcg_matrix_e layout)
+{
+	int i, j;
+	if (layout != MatNormal)
+	{
+		// Product with the transpose of A
+#pragma omp parallel for private (i, j) schedule(guided)
+		for (j = 0; j < n_size; j++)
+		{
+			Ax[j] = 0.0;
+			for (i = 0; i < m_size; i++)
+			{
+				Ax[j] += A[i][j]*x[i];
+			}
+		}
+	}
+	else
+	{
+		// Product with A as stored
+#pragma omp parallel for private (i, j) schedule(guided)
+		for (i = 0; i < m_size; i++)
+		{
+			Ax[i] = 0.0;
+			for (j = 0; j < n_size; j++)
+			{
+				Ax[i] += A[i][j]*x[j];
+			}
+		}
+	}
+}
+
+// Product of a sparse matrix stored in the COO format and a vector.
+//
+// row, col and Mat hold the row index, column index and value of the nz_size
+// stored entries of an M by N matrix.
+//   pre_position == false : p = Mat * V    (p has M entries, V is indexed by column)
+//   pre_position == true  : p = V^T * Mat  (p has N entries, V is indexed by row)
+// p is zeroed before the entries are accumulated. The indices in row and col
+// are used as they are, without any range check.
+void lcg_matvec_coo(const int *row, const int *col, const lcg_float *Mat, const lcg_float *V, lcg_float *p, int M, int N, int nz_size, bool pre_position)
+{
+	// All loop counters are signed ints like the sizes they are compared with:
+	// an unsigned counter would turn a negative size into a huge loop bound.
+	if (!pre_position)
+	{
+		for (int i = 0; i < M; i++)
+		{
+			p[i] = 0.0;
+		}
+
+		for (int i = 0; i < nz_size; i++)
+		{
+			p[row[i]] += Mat[i]*V[col[i]];
+		}
+	}
+	else
+	{
+		for (int i = 0; i < N; i++)
+		{
+			p[i] = 0.0;
+		}
+
+		for (int i = 0; i < nz_size; i++)
+		{
+			p[col[i]] += Mat[i]*V[row[i]];
+		}
+	}
+	return;
+}
--- a/src/lib/algebra.h
+++ b/src/lib/algebra.h
@@ -0,0 +1,219 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _ALGEBRA_H
+#define _ALGEBRA_H
+
+#include "config.h"
+
+/**
+ * @brief      Matrix layouts. Selects whether lcg_matvec() multiplies by the 
+ * matrix as it is stored or by its transpose.
+ */
+enum lcg_matrix_e
+{
+	MatNormal, ///< Use the matrix as stored (A)
+	MatTranspose, ///< Use the transpose of the matrix (A^T)
+};
+
+/**
+ * @brief      Conjugate types for a complex number. Not referenced by any other 
+ * declaration of this header. NOTE(review): presumably consumed by the complex 
+ * algebra routines declared elsewhere - confirm.
+ */
+enum clcg_complex_e
+{
+	NonConjugate,
+	Conjugate,
+};
+
+/**
+ * @brief      The floating-point type used by the routines of this header. 
+ * Defined in one place so that it is easy to change in the future. Right now 
+ * it is just an alias of double.
+ */
+typedef double lcg_float;
+
+/**
+ * @brief      Return the absolute value of the input
+ *
+ * @param[in]  a     input value
+ *
+ * @return     a itself if a >= 0, otherwise -1.0*a
+ */
+lcg_float lcg_abs(lcg_float a);
+
+/**
+ * @brief      Return the bigger one of two values
+ *
+ * @param[in]  a     first input value
+ * @param[in]  b     second input value
+ *
+ * @return     a if a >= b, otherwise b
+ */
+lcg_float lcg_max(lcg_float a, lcg_float b);
+
+/**
+ * @brief      Return the smaller one of two values
+ *
+ * @param[in]  a     first input value
+ * @param[in]  b     second input value
+ *
+ * @return     a if a <= b, otherwise b
+ */
+lcg_float lcg_min(lcg_float a, lcg_float b);
+
+/**
+ * @brief      Set the input value within a box constraint
+ *
+ * @param[in]  low   low boundary
+ * @param[in]  hig   high boundary
+ * @param[in]  a     input value
+ * @param[in]  low_bound    Whether to include the low boundary value. If not, an input at or 
+ * below the boundary is offset by 1e-16 towards the inside of the box instead of being set onto it
+ * @param[in]  hig_bound    Whether to include the high boundary value. If not, an input at or 
+ * above the boundary is offset by 1e-16 towards the inside of the box instead of being set onto it
+ *
+ * @return     box constrained value. An input strictly inside the box is returned unchanged
+ */
+lcg_float lcg_set2box(lcg_float low, lcg_float hig, lcg_float a, 
+	bool low_bound = true, bool hig_bound = true);
+
+/**
+ * @brief      Allocate memory for a lcg_float array. The elements are not initialized.
+ *
+ * @param[in]  n     Size of the lcg_float array.
+ *
+ * @return     Pointer to the array. Release it with lcg_free(lcg_float*).
+ */
+lcg_float* lcg_malloc(int n);
+
+/**
+ * @brief      Allocate memory for a 2D lcg_float array made of separately allocated rows. 
+ * The elements are not initialized.
+ *
+ * @param[in]  m     Number of rows.
+ * @param[in]  n     Number of elements in every row.
+ *
+ * @return     Pointer to the array of row pointers. Release it with lcg_free(lcg_float**, int).
+ */
+lcg_float** lcg_malloc(int m, int n);
+
+/**
+ * @brief      Destroy memory used by the lcg_float type array.
+ *
+ * @param      x     Pointer of the array. It is passed by value, so the caller's pointer is 
+ * not reset by this call and must not be used afterwards. A null pointer is accepted.
+ */
+void lcg_free(lcg_float* x);
+
+/**
+ * @brief      Destroy memory used by the 2D lcg_float type array.
+ *
+ * @param      x     Pointer of the array. It is passed by value, so the caller's pointer is 
+ * not reset by this call and must not be used afterwards. A null pointer is accepted.
+ * @param[in]  m     Number of rows of the array, every one of which is released.
+ */
+void lcg_free(lcg_float **x, int m);
+
+/**
+ * @brief      Set all elements of a vector to one value
+ *
+ * @param      a     pointer of the vector
+ * @param[in]  b     value assigned to every element
+ * @param[in]  size  vector size
+ */
+void lcg_vecset(lcg_float *a, lcg_float b, int size);
+
+/**
+ * @brief      Set all elements of a 2D array (matrix) to one value
+ *
+ * @param      a     pointer of the matrix
+ * @param[in]  b     value assigned to every element
+ * @param[in]  m     row size of the matrix
+ * @param[in]  n     column size of the matrix
+ */
+void lcg_vecset(lcg_float **a, lcg_float b, int m, int n);
+
+/**
+ * @brief      Fill a vector with random values
+ *
+ * @param      a     pointer of the vector
+ * @param[in]  l     the lower bound of random values
+ * @param[in]  h     the higher bound of random values
+ * @param[in]  size  size of the vector
+ */
+void lcg_vecrnd(lcg_float *a, lcg_float l, lcg_float h, int size);
+
+/**
+ * @brief      Fill a 2D array with random values
+ *
+ * @param      a     pointer of the 2D array
+ * @param[in]  l     the lower bound of random values
+ * @param[in]  h     the higher bound of random values
+ * @param[in]  m     row size of the array
+ * @param[in]  n     column size of the array
+ */
+void lcg_vecrnd(lcg_float **a, lcg_float l, lcg_float h, int m, int n);
+
+/**
+ * @brief    calculate the squared L2 norm of the input vector
+ * 
+ * @param a         pointer of the vector
+ * @param n         size of the vector
+ * @return double   squared L2 norm, i.e. the sum of a[i]*a[i] (no square root is taken)
+ */
+double lcg_squaredl2norm(lcg_float *a, int n);
+
+/**
+ * @brief      calculate dot product of two real vectors
+ *
+ * @param[out] ret     the dot product. It is reset to zero before the accumulation, 
+ * the function itself returns nothing
+ * @param[in]  a       pointer of the vector a
+ * @param[in]  b       pointer of the vector b
+ * @param[in]  size    size of the vectors
+ */
+void lcg_dot(lcg_float &ret, const lcg_float *a, const lcg_float *b, int size);
+
+/**
+ * @brief      calculate product of a real matrix and a vector
+ * 
+ * Different configurations:
+ * layout=MatNormal -> A, x holds n_size elements and Ax holds m_size elements
+ * layout=MatTranspose -> A^T, x holds m_size elements and Ax holds n_size elements
+ *
+ * @param      A          matrix A, stored as m_size rows of n_size elements
+ * @param[in]  x          vector x
+ * @param      Ax         product of Ax, overwritten by the call
+ * @param[in]  m_size     row size of A
+ * @param[in]  n_size     column size of A
+ * @param[in]  layout     layout of A used for multiplication. Must be MatNormal or MatTranspose
+ */
+void lcg_matvec(lcg_float **A, const lcg_float *x, lcg_float *Ax, int m_size, int n_size, 
+	lcg_matrix_e layout = MatNormal);
+
+/**
+ * @brief     Calculate the product of a sparse matrix multiplied by a vector. The matrix is stored in the COO format.
+ * 
+ * @param row             Row indices of the non-zero entries of the input sparse matrix.
+ * @param col             Column indices of the non-zero entries of the input sparse matrix.
+ * @param Mat             Non-zero values of the input sparse matrix.
+ * @param V               Multiplier vector
+ * @param p               Output product. Its first M elements are written if pre_position is false, its first N elements otherwise
+ * @param M               Row number of the sparse matrix
+ * @param N               Column number of the sparse matrix
+ * @param nz_size         Non-zero size of the matrix
+ * @param pre_position    If true, the multiplier is seen as a row vector placed in front of the matrix. Otherwise, it is treated as a column vector.
+ */
+void lcg_matvec_coo(const int *row, const int *col, const lcg_float *Mat, const lcg_float *V, lcg_float *p, int M, int N, int nz_size, bool pre_position = false);
+
+#endif //_ALGEBRA_H
--- a/src/lib/algebra_cuda.cu
+++ b/src/lib/algebra_cuda.cu
@@ -0,0 +1,110 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "algebra_cuda.h"
+
+
+// Kernel: constrain a[i] to the box spanned by low[i] and hig[i] for every
+// i < n. Each thread handles the element at its flat 1D global index; threads
+// with an index of n or more do nothing.
+//
+// low_bound / hig_bound tell whether the respective boundary value itself is
+// admissible, in the same way as the host function lcg_set2box(): an element
+// at or beyond an excluded boundary is moved 1e-16 to the inside of it, or to
+// the neighbouring representable value in the direction of the opposite
+// boundary when that margin is absorbed by rounding. Before, the two flags
+// had no effect, since both of their branches stored the boundary itself.
+// The high side is tested first, as on the host. Elements strictly inside
+// the box are not written.
+__global__ void lcg_set2box_cuda_device(const lcg_float *low, const lcg_float *hig, lcg_float *a, 
+    int n, bool low_bound, bool hig_bound)
+{
+	const int i = blockIdx.x * blockDim.x + threadIdx.x;
+	if (i >= n) return;
+
+	const lcg_float lo = low[i];
+	const lcg_float hi = hig[i];
+	const lcg_float v = a[i];
+
+	if (v >= hi)
+	{
+		const lcg_float shifted = hi - 1e-16;
+		a[i] = hig_bound ? hi : ((shifted < hi) ? shifted : nextafter(hi, lo));
+	}
+	else if (v <= lo)
+	{
+		const lcg_float shifted = lo + 1e-16;
+		a[i] = low_bound ? lo : ((shifted > lo) ? shifted : nextafter(lo, hi));
+	}
+	return;
+}
+
+// Kernel: extract the diagonal of a square sparse matrix stored in the CSR
+// format. Each thread handles the row at its flat 1D global index; threads
+// with an index of A_len or more do nothing.
+//
+// The stored entries of row i occupy the positions A_ptr[i] .. A_ptr[i+1]-1 of
+// A_col and A_val. The first entry whose column index equals i is the diagonal
+// element. A row without a stored diagonal entry yields 0, the value such an
+// entry stands for; before, A_diag[i] was left unwritten in that case and kept
+// whatever the output buffer happened to contain.
+__global__ void lcg_smDcsr_get_diagonal_device(const int *A_ptr, const int *A_col, const lcg_float *A_val, const int A_len, lcg_float *A_diag)
+{
+	const int i = blockIdx.x * blockDim.x + threadIdx.x;
+	if (i < A_len)
+	{
+		const int row_end = A_ptr[i + 1];
+		lcg_float diag = 0.0;
+
+		for (int k = A_ptr[i]; k < row_end; k++)
+		{
+			if (A_col[k] == i)
+			{
+				diag = A_val[k];
+				break;
+			}
+		}
+		A_diag[i] = diag;
+	}
+	return;
+}
+
+// Kernel: c[i] = a[i] * b[i] for every i < n. Each thread handles the element
+// at its flat 1D global index; threads with an index of n or more do nothing.
+__global__ void lcg_vecMvecD_element_wise_device(const lcg_float *a, const lcg_float *b, lcg_float *c, int n)
+{
+	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx >= n) return;
+
+	c[idx] = a[idx] * b[idx];
+}
+
+// Kernel: c[i] = a[i] / b[i] for every i < n. Each thread handles the element
+// at its flat 1D global index; threads with an index of n or more do nothing.
+// The divisor is used as it is, without a check against zero.
+__global__ void lcg_vecDvecD_element_wise_device(const lcg_float *a, const lcg_float *b, lcg_float *c, int n)
+{
+	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx >= n) return;
+
+	c[idx] = a[idx] / b[idx];
+}
+
+// Host wrapper: launch lcg_set2box_cuda_device over n elements with blocks of
+// 1024 threads on the default stream. low, hig and a are handed to the kernel
+// as they are and dereferenced there. The launch is not followed by a
+// synchronization or an error query.
+//
+// Nothing is launched when n is not positive: a grid of zero blocks is an
+// invalid launch configuration and would only leave a CUDA error behind.
+void lcg_set2box_cuda(const lcg_float *low, const lcg_float *hig, lcg_float *a, 
+    int n, bool low_bound, bool hig_bound)
+{
+	if (n <= 0) return;
+
+	const int blockSize = 1024;
+	// Ceiling division written so that it can not overflow for a large n
+	const int numBlocks = (n - 1) / blockSize + 1;
+	lcg_set2box_cuda_device<<<numBlocks, blockSize>>>(low, hig, a, n, low_bound, hig_bound);
+	return;
+}
+
+// Host wrapper: launch lcg_smDcsr_get_diagonal_device over the A_len rows of
+// the matrix with blocks of bk_size threads on the default stream. All
+// pointers are handed to the kernel as they are and dereferenced there. The
+// launch is not followed by a synchronization or an error query.
+//
+// Nothing is launched when A_len is not positive (a grid of zero blocks is an
+// invalid launch configuration). A bk_size that is not positive would divide
+// by zero below, and one above 1024 exceeds the number of threads a block may
+// hold; both fall back to 1024.
+void lcg_smDcsr_get_diagonal(const int *A_ptr, const int *A_col, const lcg_float *A_val, const int A_len, lcg_float *A_diag, int bk_size)
+{
+	if (A_len <= 0) return;
+
+	const int blockSize = (bk_size > 0 && bk_size <= 1024) ? bk_size : 1024;
+	// Ceiling division written so that it can not overflow for a large A_len
+	const int numBlocks = (A_len - 1) / blockSize + 1;
+	lcg_smDcsr_get_diagonal_device<<<numBlocks, blockSize>>>(A_ptr, A_col, A_val, A_len, A_diag);
+	return;
+}
+
+// Host wrapper: launch lcg_vecMvecD_element_wise_device (c[i] = a[i] * b[i])
+// over n elements with blocks of bk_size threads on the default stream. All
+// pointers are handed to the kernel as they are and dereferenced there. The
+// launch is not followed by a synchronization or an error query.
+//
+// Nothing is launched when n is not positive (a grid of zero blocks is an
+// invalid launch configuration). A bk_size that is not positive would divide
+// by zero below, and one above 1024 exceeds the number of threads a block may
+// hold; both fall back to 1024.
+void lcg_vecMvecD_element_wise(const lcg_float *a, const lcg_float *b, lcg_float *c, int n, int bk_size)
+{
+	if (n <= 0) return;
+
+	const int blockSize = (bk_size > 0 && bk_size <= 1024) ? bk_size : 1024;
+	// Ceiling division written so that it can not overflow for a large n
+	const int numBlocks = (n - 1) / blockSize + 1;
+	lcg_vecMvecD_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
+	return;
+}
+
+// Host wrapper: launch lcg_vecDvecD_element_wise_device (c[i] = a[i] / b[i])
+// over n elements with blocks of bk_size threads on the default stream. All
+// pointers are handed to the kernel as they are and dereferenced there. The
+// launch is not followed by a synchronization or an error query.
+//
+// Nothing is launched when n is not positive (a grid of zero blocks is an
+// invalid launch configuration). A bk_size that is not positive would divide
+// by zero below, and one above 1024 exceeds the number of threads a block may
+// hold; both fall back to 1024.
+void lcg_vecDvecD_element_wise(const lcg_float *a, const lcg_float *b, lcg_float *c, int n, int bk_size)
+{
+	if (n <= 0) return;
+
+	const int blockSize = (bk_size > 0 && bk_size <= 1024) ? bk_size : 1024;
+	// Ceiling division written so that it can not overflow for a large n
+	const int numBlocks = (n - 1) / blockSize + 1;
+	lcg_vecDvecD_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
+	return;
+}
--- a/src/lib/algebra_cuda.h
+++ b/src/lib/algebra_cuda.h
@@ -0,0 +1,88 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _ALGEBRA_CUDA_H
+#define _ALGEBRA_CUDA_H
+
+#include "algebra.h"
+
+#ifdef LibLCG_CUDA
+
+#include <cuda_runtime.h>
+
+/**
+ * @brief      Set the input values within box constraints, element by element
+ * 
+ * @note       This is a host function that launches a CUDA kernel. All arrays are read and written on the GPU device.
+ *
+ * @param[in]  low   low boundaries, one for every element
+ * @param[in]  hig   high boundaries, one for every element
+ * @param      a     input values, overwritten by the box constrained values
+ * @param[in]  n     Length of the arrays
+ * @param      low_bound    Whether to include the low boundary value
+ * @param      hig_bound    Whether to include the high boundary value
+ */
+void lcg_set2box_cuda(const lcg_float *low, const lcg_float *hig, lcg_float *a, 
+    int n, bool low_bound = true, bool hig_bound = true);
+
+/**
+ * @brief      Extract diagonal elements from a square CUDA sparse matrix that is formatted in the CSR format
+ * 
+ * @note       This is a host function that launches a CUDA kernel. All arrays are read and written on the GPU device.
+ *
+ * @param[in]  A_ptr   Row index pointer
+ * @param[in]  A_col   Column index
+ * @param[in]  A_val   Non-zero values of the matrix
+ * @param[in]  A_len   Dimension of the matrix
+ * @param      A_diag  Output diagonal elements
+ * @param[in]  bk_size CUDA block size (threads per block) used for the kernel launch.
+ */
+void lcg_smDcsr_get_diagonal(const int *A_ptr, const int *A_col, const lcg_float *A_val, const int A_len, lcg_float *A_diag, int bk_size = 1024);
+
+/**
+ * @brief      Element-wise multiplication between two CUDA arrays, c[i] = a[i] * b[i].
+ * 
+ * @note       This is a host function that launches a CUDA kernel. All arrays are read and written on the GPU device.
+ *
+ * @param[in]  a     Pointer of the input array
+ * @param[in]  b     Pointer of the input array
+ * @param      c     Pointer of the output array
+ * @param[in]  n     Length of the arrays
+ * @param[in]  bk_size CUDA block size (threads per block) used for the kernel launch.
+ */
+void lcg_vecMvecD_element_wise(const lcg_float *a, const lcg_float *b, lcg_float *c, int n, int bk_size = 1024);
+
+/**
+ * @brief      Element-wise division between two CUDA arrays, c[i] = a[i] / b[i].
+ * 
+ * @note       This is a host function that launches a CUDA kernel. All arrays are read and written on the GPU device.
+ *
+ * @param[in]  a     Pointer of the input array (dividends)
+ * @param[in]  b     Pointer of the input array (divisors, not checked against zero)
+ * @param      c     Pointer of the output array
+ * @param[in]  n     Length of the arrays
+ * @param[in]  bk_size CUDA block size (threads per block) used for the kernel launch.
+ */
+void lcg_vecDvecD_element_wise(const lcg_float *a, const lcg_float *b, lcg_float *c, int n, int bk_size = 1024);
+
+#endif // LibLCG_CUDA
+
+#endif //_ALGEBRA_CUDA_H
--- a/src/lib/algebra_eigen.cpp
+++ b/src/lib/algebra_eigen.cpp
@@ -0,0 +1,32 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "algebra_eigen.h"
+
+// Clamp every element of m into the box [low[i], hig[i]], both boundaries
+// included (lcg_set2box() is called with its default flags).
+// m is taken by reference so that the clamped values reach the caller; it used
+// to be taken by value, which made the whole function work on a private copy
+// and return without any effect. The declaration in algebra_eigen.h is changed
+// along with this definition.
+// low and hig are indexed with the indices of m and are not size-checked here.
+void lcg_set2box_eigen(const Eigen::VectorXd &low, const Eigen::VectorXd &hig, Eigen::VectorXd &m)
+{
+	for (int i = 0; i < m.size(); i++)
+	{
+		m[i] = lcg_set2box(low[i], hig[i], m[i]);
+	}
+	return;
+}
--- a/src/lib/algebra_eigen.h
+++ b/src/lib/algebra_eigen.h
@@ -0,0 +1,43 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _ALGEBRA_EIGEN_H
+#define _ALGEBRA_EIGEN_H
+
+#include "algebra.h"
+
+#ifdef LibLCG_EIGEN
+
+#include "Eigen/Dense"
+
+/**
+ * @brief      Set the input values within box constraints, element by element. 
+ * Both boundary values are included in the box.
+ *
+ * @param[in]  low          Low boundaries, one for every element of m
+ * @param[in]  hig          High boundaries, one for every element of m
+ * @param      m            Input values, overwritten in place by the box constrained values
+ */
+void lcg_set2box_eigen(const Eigen::VectorXd &low, const Eigen::VectorXd &hig, Eigen::VectorXd &m);
+
+#endif // LibLCG_EIGEN
+
+#endif // _ALGEBRA_EIGEN_H
--- a/src/lib/clcg.cpp
+++ b/src/lib/clcg.cpp
@@ -0,0 +1,837 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "clcg.h"
+
+#include "cmath"
+
+#include "config.h"
+#ifdef LibLCG_OPENMP
+#include "omp.h"
+#endif
+
+// Common signature of every complex solver in this file. clcg_solver() selects 
+// one of them and calls it through a pointer of this type.
+using clcg_solver_ptr = int (*)(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, 
+	const lcg_complex* B, const int n_size, const clcg_para* param, void* instance);
+
+// Forward declarations of the solvers defined later in this file.
+
+// Selected by CLCG_BICG
+int clbicg(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance);
+
+// Selected by CLCG_BICG_SYM
+int clbicg_symmetric(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance);
+
+// Selected by CLCG_CGS (also the fallback for unknown solver ids)
+int clcgs(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance);
+
+// Selected by CLCG_BICGSTAB
+int clbicgstab(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance);
+
+// Selected by CLCG_TFQMR
+int cltfqmr(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance);
+
+// Dispatch a complex linear system to the solver selected by 'solver_id'. 
+// All arguments except 'solver_id' are forwarded unchanged and the status 
+// code of the selected solver is returned.
+int clcg_solver(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, 
+	const lcg_complex* B, const int n_size, const clcg_para* param, void* instance, 
+	clcg_solver_enum solver_id)
+{
+	// CGS serves both CLCG_CGS and any id that is not listed below
+	clcg_solver_ptr chosen = clcgs;
+
+	if (solver_id == CLCG_BICG) chosen = clbicg;
+	else if (solver_id == CLCG_BICG_SYM) chosen = clbicg_symmetric;
+	else if (solver_id == CLCG_BICGSTAB) chosen = clbicgstab;
+	else if (solver_id == CLCG_TFQMR) chosen = cltfqmr;
+
+	return chosen(Afp, Pfp, m, B, n_size, param, instance);
+}
+
+
+/**
+ * @brief      Biconjugate gradient (BiCG) solver for the complex linear system A*m = B.
+ *
+ * @param[in]  Afp         Callback calculating the product of the system matrix and a vector. 
+ * It is called with (MatNormal, NonConjugate) and with (MatTranspose, Conjugate). Must not be nullptr.
+ * @param[in]  Pfp         Callback monitoring the iteration progress. May be nullptr. 
+ * A non-zero return value terminates the iteration with CLCG_STOP.
+ * @param      m           Initial solution on entry. Updated in place by the iteration.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of m and B.
+ * @param[in]  param       Parameters of the solver. defparam2 is used if this is nullptr.
+ * @param      instance    User data passed through to Afp and Pfp.
+ *
+ * @return     Status of the function.
+ */
+int clbicg(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance)
+{
+	// set BiCG parameters (library defaults if none are given)
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	//check parameters
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	// Afp is called unconditionally below, so a null callback is rejected here
+	if (Afp == nullptr) return CLCG_INVALID_POINTER;
+	if (m == nullptr) return CLCG_INVALID_POINTER;
+	if (B == nullptr) return CLCG_INVALID_POINTER;
+
+	int i;
+	lcg_complex *r1k = nullptr, *r2k = nullptr, *d1k = nullptr, *d2k = nullptr;
+	lcg_complex *Ax = nullptr;
+	r1k = clcg_malloc(n_size); r2k = clcg_malloc(n_size);
+	d1k = clcg_malloc(n_size); d2k = clcg_malloc(n_size);
+	Ax  = clcg_malloc(n_size);
+
+	lcg_complex ak, Ad1d2, r1r2_next, betak;
+
+	Afp(instance, m, Ax, n_size, MatNormal, NonConjugate);
+
+	// initial residual r1 = B - A*m, the second residual starts as its conjugate
+#pragma omp parallel for private (i) schedule(guided)
+	for (i = 0; i < n_size; i++)
+	{
+		d1k[i] = r1k[i] = B[i] - Ax[i];
+		d2k[i] = r2k[i] = clcg_conjugate(&r1k[i]);
+	}
+
+	lcg_complex r1r2;
+	clcg_inner(r1r2, r2k, r1k, n_size);
+
+	lcg_float r0_square, rk_square;
+	lcg_complex rk_mod;
+	clcg_inner(rk_mod, r1k, r1k, n_size);
+	r0_square = rk_square = clcg_square(&rk_mod);
+	// the reference value of the relative criterion is never smaller than one
+	if (r0_square < 1.0) r0_square = 1.0;
+
+	// All variables are declared before the first 'goto func_ends' so that no 
+	// jump bypasses an initialization.
+	int ret, t = 0;
+	lcg_float residual;
+
+	// Test the initial solution with the same measure that the loop uses. 
+	// The absolute and relative criteria are never mixed.
+	if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+	else residual = rk_square/r0_square;
+
+	if (residual <= para.epsilon)
+	{
+		// NOTE(review): LCG_-prefixed code returned by a CLCG solver. Confirm 
+		// it agrees with the complex solver's return enum.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, m, residual, &para, n_size, 0);
+		}
+		goto func_ends;
+	}
+
+	while(1)
+	{
+		if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+		else residual = rk_square/r0_square;
+
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, m, residual, &para, n_size, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 disables the iteration limit
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			// NOTE(review): LCG_-prefixed code, see the note above.
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		Afp(instance, d1k, Ax, n_size, MatNormal, NonConjugate);
+		clcg_inner(Ad1d2, d2k, Ax, n_size);
+		ak = r1r2/Ad1d2;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			m[i] = m[i] + ak*d1k[i];
+			r1k[i] = r1k[i] - ak*Ax[i];
+		}
+
+		clcg_inner(rk_mod, r1k, r1k, n_size);
+		rk_square = clcg_square(&rk_mod);
+
+		Afp(instance, d2k, Ax, n_size, MatTranspose, Conjugate);
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			r2k[i] = r2k[i] - clcg_conjugate(&ak)*Ax[i];
+		}
+
+		// a value that differs from itself is NaN
+		for (i = 0; i < n_size; i++)
+		{
+			if (m[i] != m[i])
+			{
+				ret = CLCG_NAN_VALUE; goto func_ends;
+			}
+		}
+
+		clcg_inner(r1r2_next, r2k, r1k, n_size);
+		betak = r1r2_next/r1r2;
+		r1r2 = r1r2_next;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			d1k[i] = r1k[i] + betak*d1k[i];
+			d2k[i] = r2k[i] + clcg_conjugate(&betak)*d2k[i];
+		}
+	}
+
+	func_ends:
+	{
+		clcg_free(r1k);
+		clcg_free(r2k);
+		clcg_free(d1k);
+		clcg_free(d2k);
+		clcg_free(Ax);
+	}
+
+	return ret;
+}
+
+/**
+ * @brief      BiCG variant that keeps a single residual/direction pair and only calls 
+ * Afp with (MatNormal, NonConjugate). Presumably intended for a complex symmetric 
+ * system matrix (to be confirmed against the library documentation).
+ *
+ * @param[in]  Afp         Callback calculating the product of the system matrix and a vector. Must not be nullptr.
+ * @param[in]  Pfp         Callback monitoring the iteration progress. May be nullptr. 
+ * A non-zero return value terminates the iteration with CLCG_STOP.
+ * @param      m           Initial solution on entry. Updated in place by the iteration.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of m and B.
+ * @param[in]  param       Parameters of the solver. defparam2 is used if this is nullptr.
+ * @param      instance    User data passed through to Afp and Pfp.
+ *
+ * @return     Status of the function.
+ */
+int clbicg_symmetric(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance)
+{
+	// set solver parameters (library defaults if none are given)
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	//check parameters
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	// Afp is called unconditionally below, so a null callback is rejected here
+	if (Afp == nullptr) return CLCG_INVALID_POINTER;
+	if (m == nullptr) return CLCG_INVALID_POINTER;
+	if (B == nullptr) return CLCG_INVALID_POINTER;
+
+	int i;
+	lcg_complex *rk = nullptr, *dk = nullptr;
+	lcg_complex *Ax = nullptr;
+	rk = clcg_malloc(n_size); dk = clcg_malloc(n_size);
+	Ax = clcg_malloc(n_size);
+
+	lcg_complex ak, rkrk2, betak, dkAx;
+
+	Afp(instance, m, Ax, n_size, MatNormal, NonConjugate);
+
+	// initial residual and search direction: r = d = B - A*m
+#pragma omp parallel for private (i) schedule(guided)
+	for (i = 0; i < n_size; i++)
+	{
+		dk[i] = rk[i] = B[i] - Ax[i];
+	}
+
+	// the recurrence coefficients use clcg_dot(), the residual norm uses clcg_inner()
+	lcg_complex rkrk;
+	clcg_dot(rkrk, rk, rk, n_size);
+
+	lcg_float r0_square, rk_square;
+	lcg_complex rk_mod;
+	clcg_inner(rk_mod, rk, rk, n_size);
+	r0_square = rk_square = clcg_square(&rk_mod);
+	// the reference value of the relative criterion is never smaller than one
+	if (r0_square < 1.0) r0_square = 1.0;
+
+	// All variables are declared before the first 'goto func_ends' so that no 
+	// jump bypasses an initialization.
+	int ret, t = 0;
+	lcg_float residual;
+
+	// Test the initial solution with the same measure that the loop uses. 
+	// The absolute and relative criteria are never mixed.
+	if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+	else residual = rk_square/r0_square;
+
+	if (residual <= para.epsilon)
+	{
+		// NOTE(review): LCG_-prefixed code returned by a CLCG solver. Confirm 
+		// it agrees with the complex solver's return enum.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, m, residual, &para, n_size, 0);
+		}
+		goto func_ends;
+	}
+
+	while(1)
+	{
+		if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+		else residual = rk_square/r0_square;
+
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, m, residual, &para, n_size, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 disables the iteration limit
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			// NOTE(review): LCG_-prefixed code, see the note above.
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		Afp(instance, dk, Ax, n_size, MatNormal, NonConjugate);
+		clcg_dot(dkAx, dk, Ax, n_size);
+		ak = rkrk/dkAx;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			m[i] = m[i] + ak*dk[i];
+			rk[i] = rk[i] - ak*Ax[i];
+		}
+
+		clcg_inner(rk_mod, rk, rk, n_size);
+		rk_square = clcg_square(&rk_mod);
+
+		// a value that differs from itself is NaN
+		for (i = 0; i < n_size; i++)
+		{
+			if (m[i] != m[i])
+			{
+				ret = CLCG_NAN_VALUE; goto func_ends;
+			}
+		}
+
+		clcg_dot(rkrk2, rk, rk, n_size);
+		betak = rkrk2/rkrk;
+		rkrk = rkrk2;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			dk[i] = rk[i] + betak*dk[i];
+		}
+	}
+
+	func_ends:
+	{
+		clcg_free(rk);
+		clcg_free(dk);
+		clcg_free(Ax);
+	}
+
+	return ret;
+}
+
+/**
+ * @brief      Conjugate gradient squared (CGS) solver for the complex linear system A*m = B.
+ *
+ * @param[in]  Afp         Callback calculating the product of the system matrix and a vector. 
+ * It is only called with (MatNormal, NonConjugate). Must not be nullptr.
+ * @param[in]  Pfp         Callback monitoring the iteration progress. May be nullptr. 
+ * A non-zero return value terminates the iteration with CLCG_STOP.
+ * @param      m           Initial solution on entry. Updated in place by the iteration.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of m and B.
+ * @param[in]  param       Parameters of the solver. defparam2 is used if this is nullptr.
+ * @param      instance    User data passed through to Afp and Pfp.
+ *
+ * @return     Status of the function.
+ */
+int clcgs(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance)
+{
+	// set CGS parameters (library defaults if none are given)
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	//check parameters
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	// Afp is called unconditionally below, so a null callback is rejected here
+	if (Afp == nullptr) return CLCG_INVALID_POINTER;
+	if (m == nullptr) return CLCG_INVALID_POINTER;
+	if (B == nullptr) return CLCG_INVALID_POINTER;
+
+	int i;
+	lcg_complex *rk = nullptr, *rbar0 = nullptr, *pk = nullptr;
+	lcg_complex *Ax = nullptr, *uk = nullptr, *qk = nullptr, *wk = nullptr; // w_k = u_{k-1} + q_k
+	rk = clcg_malloc(n_size); rbar0 = clcg_malloc(n_size);
+	pk = clcg_malloc(n_size); Ax  = clcg_malloc(n_size);
+	uk = clcg_malloc(n_size); qk  = clcg_malloc(n_size);
+	wk = clcg_malloc(n_size);
+
+	lcg_complex ak, rhok, rhok2, sigma, betak;
+
+	Afp(instance, m, Ax, n_size, MatNormal, NonConjugate);
+
+	// initial residual: p = u = r = B - A*m
+#pragma omp parallel for private (i) schedule(guided)
+	for (i = 0; i < n_size; i++)
+	{
+		pk[i] = uk[i] = rk[i] = B[i] - Ax[i];
+	}
+
+	lcg_float r0_square, rk_square;
+	lcg_complex rk_mod;
+	clcg_inner(rk_mod, rk, rk, n_size);
+	r0_square = rk_square = clcg_square(&rk_mod);
+	// the reference value of the relative criterion is never smaller than one
+	if (r0_square < 1.0) r0_square = 1.0;
+
+	// All variables are declared before the first 'goto func_ends' so that no 
+	// jump bypasses an initialization.
+	int ret, t = 0;
+	lcg_float residual;
+
+	// Test the initial solution with the same measure that the loop uses. 
+	// This is done BEFORE the shadow residual is drawn: with a zero residual 
+	// the inner product below is always zero and the search would never end.
+	if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+	else residual = rk_square/r0_square;
+
+	if (residual <= para.epsilon)
+	{
+		// NOTE(review): LCG_-prefixed code returned by a CLCG solver. Confirm 
+		// it agrees with the complex solver's return enum.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, m, residual, &para, n_size, 0);
+		}
+		goto func_ends;
+	}
+
+	// Draw a random shadow residual that is not (numerically) orthogonal to rk. 
+	// NOTE(review): the number of retries is not bounded.
+	do
+	{
+		clcg_vecrnd(rbar0, lcg_complex(1.0, 0.0), lcg_complex(2.0, 0.0), n_size);
+		clcg_inner(rhok, rbar0, rk, n_size);
+	} while (clcg_module(&rhok) < 1e-8);
+
+	while(1)
+	{
+		if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+		else residual = rk_square/r0_square;
+
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, m, residual, &para, n_size, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 disables the iteration limit
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			// NOTE(review): LCG_-prefixed code, see the note above.
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		Afp(instance, pk, Ax, n_size, MatNormal, NonConjugate); // vk = Apk
+		clcg_inner(sigma, rbar0, Ax, n_size);
+		ak = rhok/sigma;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			qk[i] = uk[i] - ak*Ax[i];
+			wk[i] = uk[i] + qk[i];
+		}
+
+		Afp(instance, wk, Ax, n_size, MatNormal, NonConjugate);
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			m[i] = m[i] + ak*wk[i];
+			rk[i] = rk[i] - ak*Ax[i];
+		}
+
+		clcg_inner(rk_mod, rk, rk, n_size);
+		rk_square = clcg_square(&rk_mod);
+
+		// a value that differs from itself is NaN
+		for (i = 0; i < n_size; i++)
+		{
+			if (m[i] != m[i])
+			{
+				ret = CLCG_NAN_VALUE; goto func_ends;
+			}
+		}
+
+		clcg_inner(rhok2, rbar0, rk, n_size);
+		betak = rhok2/rhok;
+		rhok = rhok2;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			uk[i] = rk[i] + betak*qk[i];
+			pk[i] = uk[i] + betak*(qk[i] + betak*pk[i]);
+		}
+	}
+
+	func_ends:
+	{
+		clcg_free(rk);
+		clcg_free(rbar0);
+		clcg_free(pk);
+		clcg_free(Ax);
+		clcg_free(uk);
+		clcg_free(qk);
+		clcg_free(wk);
+	}
+
+	return ret;
+}
+
+/**
+ * @brief      Biconjugate gradient stabilized (BiCGSTAB) solver for the complex linear system A*m = B.
+ *
+ * @param[in]  Afp         Callback calculating the product of the system matrix and a vector. 
+ * It is only called with (MatNormal, NonConjugate). Must not be nullptr.
+ * @param[in]  Pfp         Callback monitoring the iteration progress. May be nullptr. 
+ * A non-zero return value terminates the iteration with CLCG_STOP.
+ * @param      m           Initial solution on entry. Updated in place by the iteration.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of m and B.
+ * @param[in]  param       Parameters of the solver. defparam2 is used if this is nullptr.
+ * @param      instance    User data passed through to Afp and Pfp.
+ *
+ * @return     Status of the function.
+ */
+int clbicgstab(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance)
+{
+	// set BICGSTAB parameters (library defaults if none are given)
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	//check parameters
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	// Afp is called unconditionally below, so a null callback is rejected here
+	if (Afp == nullptr) return CLCG_INVALID_POINTER;
+	if (m == nullptr) return CLCG_INVALID_POINTER;
+	if (B == nullptr) return CLCG_INVALID_POINTER;
+
+	int i;
+	lcg_complex *rk = nullptr, *rbar0 = nullptr, *pk = nullptr, *sk = nullptr;
+	lcg_complex *Ap = nullptr, *As = nullptr;
+	rk = clcg_malloc(n_size); rbar0 = clcg_malloc(n_size);
+	pk = clcg_malloc(n_size); sk = clcg_malloc(n_size);
+	Ap = clcg_malloc(n_size); As = clcg_malloc(n_size);
+
+	lcg_complex ak, rhok, rhok2, sigma, omega, betak, Ass, AsAs;
+
+	Afp(instance, m, Ap, n_size, MatNormal, NonConjugate);
+
+	// initial residual: p = r = B - A*m
+#pragma omp parallel for private (i) schedule(guided)
+	for (i = 0; i < n_size; i++)
+	{
+		pk[i] = rk[i] = B[i] - Ap[i];
+	}
+
+	lcg_float r0_square, rk_square;
+	lcg_complex rk_mod;
+	clcg_inner(rk_mod, rk, rk, n_size);
+	r0_square = rk_square = clcg_square(&rk_mod);
+	// the reference value of the relative criterion is never smaller than one
+	if (r0_square < 1.0) r0_square = 1.0;
+
+	// All variables are declared before the first 'goto func_ends' so that no 
+	// jump bypasses an initialization.
+	int ret, t = 0;
+	lcg_float residual;
+
+	// Test the initial solution with the same measure that the loop uses. 
+	// This is done BEFORE the shadow residual is drawn: with a zero residual 
+	// the inner product below is always zero and the search would never end.
+	if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+	else residual = rk_square/r0_square;
+
+	if (residual <= para.epsilon)
+	{
+		// NOTE(review): LCG_-prefixed code returned by a CLCG solver. Confirm 
+		// it agrees with the complex solver's return enum.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, m, residual, &para, n_size, 0);
+		}
+		goto func_ends;
+	}
+
+	// Draw a random shadow residual that is not (numerically) orthogonal to rk. 
+	// NOTE(review): the number of retries is not bounded.
+	do
+	{
+		clcg_vecrnd(rbar0, lcg_complex(1.0, 0.0), lcg_complex(2.0, 0.0), n_size);
+		clcg_inner(rhok, rbar0, rk, n_size);
+	} while (clcg_module(&rhok) < 1e-8);
+
+	while(1)
+	{
+		if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+		else residual = rk_square/r0_square;
+
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, m, residual, &para, n_size, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 disables the iteration limit
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			// NOTE(review): LCG_-prefixed code, see the note above.
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		Afp(instance, pk, Ap, n_size, MatNormal, NonConjugate);
+		clcg_inner(sigma, rbar0, Ap, n_size);
+		ak = rhok/sigma;
+
+		// intermediate residual s = r - a*A*p
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			sk[i] = rk[i] - ak*Ap[i];
+		}
+
+		Afp(instance, sk, As, n_size, MatNormal, NonConjugate);
+		clcg_inner(Ass, As, sk, n_size);
+		clcg_inner(AsAs, As, As, n_size);
+		omega = Ass/AsAs;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			m[i] = m[i] + ak*pk[i] + omega*sk[i];
+			rk[i] = sk[i] - omega*As[i];
+		}
+
+		clcg_inner(rk_mod, rk, rk, n_size);
+		rk_square = clcg_square(&rk_mod);
+
+		// a value that differs from itself is NaN
+		for (i = 0; i < n_size; i++)
+		{
+			if (m[i] != m[i])
+			{
+				ret = CLCG_NAN_VALUE; goto func_ends;
+			}
+		}
+
+		clcg_inner(rhok2, rbar0, rk, n_size);
+		betak = rhok2*ak/(rhok*omega);
+		rhok = rhok2;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			pk[i] = rk[i] + betak*(pk[i] - omega*Ap[i]);
+		}
+	}
+
+	func_ends:
+	{
+		clcg_free(rk);
+		clcg_free(rbar0);
+		clcg_free(pk);
+		clcg_free(sk);
+		clcg_free(Ap);
+		clcg_free(As);
+	}
+
+	return ret;
+}
+
+/**
+ * @brief      Transpose-free quasi-minimal residual (TFQMR) solver for the complex linear system A*m = B.
+ *
+ * Every pass of the outer loop performs two solution updates (j = 1, 2). Each 
+ * update counts as one iteration and reports to Pfp.
+ *
+ * @param[in]  Afp         Callback calculating the product of the system matrix and a vector. 
+ * It is only called with (MatNormal, NonConjugate). Must not be nullptr.
+ * @param[in]  Pfp         Callback monitoring the iteration progress. May be nullptr. 
+ * A non-zero return value terminates the iteration with CLCG_STOP.
+ * @param      m           Initial solution on entry. Updated in place by the iteration.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of m and B.
+ * @param[in]  param       Parameters of the solver. defparam2 is used if this is nullptr.
+ * @param      instance    User data passed through to Afp and Pfp.
+ *
+ * @return     Status of the function.
+ */
+int cltfqmr(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, const lcg_complex* B, 
+	const int n_size, const clcg_para* param, void* instance)
+{
+	// set TFQMR parameters (library defaults if none are given)
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	//check parameters
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	// Afp is called unconditionally below, so a null callback is rejected here
+	if (Afp == nullptr) return CLCG_INVALID_POINTER;
+	if (m == nullptr) return CLCG_INVALID_POINTER;
+	if (B == nullptr) return CLCG_INVALID_POINTER;
+
+	int i, j;
+	lcg_complex *pk = nullptr, *uk = nullptr;
+	lcg_complex *vk = nullptr, *dk = nullptr;
+	lcg_complex *rbar0 = nullptr, *rk = nullptr;
+	lcg_complex *Ax = nullptr, *qk = nullptr;
+	lcg_complex *uqk = nullptr;
+	pk = clcg_malloc(n_size); uk = clcg_malloc(n_size);
+	vk = clcg_malloc(n_size); dk = clcg_malloc(n_size);
+	rbar0 = clcg_malloc(n_size); rk = clcg_malloc(n_size);
+	Ax = clcg_malloc(n_size); qk = clcg_malloc(n_size);
+	uqk = clcg_malloc(n_size);
+
+	Afp(instance, m, Ax, n_size, MatNormal, NonConjugate);
+
+	// initial residual: p = u = r = B - A*m, the update direction d starts at zero
+#pragma omp parallel for private (i) schedule(guided)
+	for (i = 0; i < n_size; i++)
+	{
+		pk[i] = uk[i] = rk[i] = B[i] - Ax[i];
+		clcg_set(&dk[i], 0.0, 0.0);
+	}
+
+	lcg_complex rho, rk_mod, rk_mod2;
+	lcg_float r0_square, rk_square;
+	clcg_inner(rk_mod, rk, rk, n_size);
+	r0_square = rk_square = clcg_square(&rk_mod);
+	// the reference value of the relative criterion is never smaller than one
+	if (r0_square < 1.0) r0_square = 1.0;
+
+	// All variables are declared before the first 'goto func_ends' so that no 
+	// jump bypasses an initialization.
+	lcg_float theta = 0.0, omega = clcg_module(&rk_mod);
+	lcg_float residual, tao = omega;
+	lcg_complex sigma, alpha, betak, rho2, sign, eta(0.0, 0.0);
+
+	int ret, t = 0;
+
+	// Test the initial solution with the same measure that the loop uses. 
+	// This is done BEFORE the shadow residual is drawn: with a zero residual 
+	// the inner product below is always zero and the search would never end.
+	if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+	else residual = rk_square/r0_square;
+
+	if (residual <= para.epsilon)
+	{
+		// NOTE(review): LCG_-prefixed code returned by a CLCG solver. Confirm 
+		// it agrees with the complex solver's return enum.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, m, residual, &para, n_size, 0);
+		}
+		goto func_ends;
+	}
+
+	// Draw a random shadow residual that is not (numerically) orthogonal to rk. 
+	// NOTE(review): the number of retries is not bounded.
+	do
+	{
+		clcg_vecrnd(rbar0, lcg_complex(1.0, 0.0), lcg_complex(2.0, 0.0), n_size);
+		clcg_inner(rho, rbar0, rk, n_size);
+	} while (clcg_module(&rho) < 1e-8);
+
+	while(1)
+	{
+		Afp(instance, pk, vk, n_size, MatNormal, NonConjugate);
+
+		clcg_inner(sigma, rbar0, vk, n_size);
+		alpha = rho/sigma;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			qk[i] = uk[i] - alpha*vk[i];
+			uqk[i] = uk[i] + qk[i];
+		}
+
+		Afp(instance, uqk, Ax, n_size, MatNormal, NonConjugate);
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			rk[i] = rk[i] - alpha*Ax[i];
+		}
+
+		clcg_inner(rk_mod2, rk, rk, n_size);
+
+		// two solution updates per outer pass, the first uses uk and the second qk
+		for (j = 1; j <= 2; j++)
+		{
+
+			if (para.abs_diff) residual = sqrt(rk_square)/n_size;
+			else residual = rk_square/r0_square;
+
+			if (Pfp != nullptr)
+			{
+				if (Pfp(instance, m, residual, &para, n_size, t))
+				{
+					ret = CLCG_STOP; goto func_ends;
+				}
+			}
+
+			if (residual <= para.epsilon)
+			{
+				ret = CLCG_CONVERGENCE; goto func_ends;
+			}
+
+			// max_iterations == 0 disables the iteration limit
+			if (para.max_iterations > 0 && t+1 > para.max_iterations)
+			{
+				// A plain 'break' would only leave this inner loop and the 
+				// outer while(1) would keep running, so jump to the clean-up.
+				// NOTE(review): LCG_-prefixed code, see the note above.
+				ret = LCG_REACHED_MAX_ITERATIONS;
+				goto func_ends;
+			}
+			
+			t++;
+
+			sign = theta*theta*(eta/alpha);
+
+			if (j == 1)
+			{
+				omega = sqrt(clcg_module(&rk_mod)*clcg_module(&rk_mod2));
+
+#pragma omp parallel for private (i) schedule(guided)
+				for (i = 0; i < n_size; i++)
+				{
+					dk[i] = uk[i] + sign*dk[i];
+				}
+			}
+			else
+			{
+				omega = clcg_module(&rk_mod2);
+
+#pragma omp parallel for private (i) schedule(guided)
+				for (i = 0; i < n_size; i++)
+				{
+					dk[i] = qk[i] + sign*dk[i];
+				}
+			}
+
+			theta = omega/tao;
+			tao = omega/sqrt(1.0+theta*theta);
+			eta = (1.0/(1.0+theta*theta))*alpha;
+
+#pragma omp parallel for private (i) schedule(guided)
+			for (i = 0; i < n_size; i++)
+			{
+				m[i] = m[i] + eta*dk[i];
+			}
+
+			// a value that differs from itself is NaN
+			for (i = 0; i < n_size; i++)
+			{
+				if (m[i] != m[i])
+				{
+					ret = CLCG_NAN_VALUE; goto func_ends;
+				}
+			}
+		}
+		rk_mod = rk_mod2;
+		rk_square = clcg_square(&rk_mod);
+
+		clcg_inner(rho2, rbar0, rk, n_size);
+		betak = rho2/rho;
+		rho = rho2;
+
+#pragma omp parallel for private (i) schedule(guided)
+		for (i = 0; i < n_size; i++)
+		{
+			uk[i] = rk[i] + betak*qk[i];
+			pk[i] = uk[i] + betak*(qk[i] + betak*pk[i]);
+		}
+	}
+
+	func_ends:
+	{
+		clcg_free(pk);
+		clcg_free(uk);
+		clcg_free(vk);
+		clcg_free(dk);
+		clcg_free(rbar0);
+		clcg_free(rk);
+		clcg_free(Ax);
+		clcg_free(qk);
+		clcg_free(uqk);
+	}
+
+	return ret;
+}
--- a/src/lib/clcg.h
+++ b/src/lib/clcg.h
@@ -0,0 +1,78 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _CLCG_H
+#define _CLCG_H
+
+#include "lcg_complex.h"
+#include "util.h"
+
+/**
+ * @brief  Callback interface for calculating the complex product of a N*N matrix 'A' multiplied 
+ * by a complex vertical vector 'x'.
+ * 
+ * @param  instance    The user data sent for the clcg_solver() functions by the client.
+ * @param  x           Multiplier of the Ax product.
+ * @param  prod_Ax     Product of A multiplied by x.
+ * @param  x_size      Size of x and column/row numbers of A.
+ * @param  layout      Whether to use the transpose of A for calculation.
+ * @param  conjugate   Whether to use the conjugate of A for calculation.
+ */
+typedef void (*clcg_axfunc_ptr)(void *instance, const lcg_complex *x, lcg_complex *prod_Ax, 
+	const int x_size, lcg_matrix_e layout, clcg_complex_e conjugate);
+
+/**
+ * @brief     Callback interface for monitoring the progress and terminating the iteration 
+ * if necessary.
+ * 
+ * @param    instance    The user data sent for the clcg_solver() functions by the client.
+ * @param    m           The current solutions.
+ * @param    converge    The current value evaluating the iteration progress.
+ * @param    param       The parameter set used by the solver.
+ * @param    n_size      The size of the variables
+ * @param    k           The iteration count.
+ * 
+ * @retval   int         Zero to continue the optimization process. Returning a
+ *                       non-zero value will terminate the optimization process.
+ */
+typedef int (*clcg_progress_ptr)(void* instance, const lcg_complex* m, 
+	const lcg_float converge, const clcg_para* param, const int n_size, const int k);
+
+/**
+ * @brief      A combined complex conjugate gradient solver function.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector.
+ * @param      B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param      param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the clcg_solver() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_BICG.
+ *
+ * @return     Status of the function.
+ */
+int clcg_solver(clcg_axfunc_ptr Afp, clcg_progress_ptr Pfp, lcg_complex* m, 
+	const lcg_complex* B, const int n_size, const clcg_para* param, void* instance, 
+	clcg_solver_enum solver_id = CLCG_BICG);
+
+#endif // _CLCG_H
--- a/src/lib/clcg_cuda.cu
+++ b/src/lib/clcg_cuda.cu
@@ -0,0 +1,529 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "cmath"
+#include "ctime"
+#include "iostream"
+
+#include "clcg_cuda.h"
+
+
+// Common signature of the un-preconditioned double-precision solvers in this file.
+// clcg_solver_cuda() stores the selected solver in a pointer of this type and calls
+// it with the arguments it received.
+typedef int (*cuda_solver_ptr)(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
+    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Forward declaration of the solver selected by CLCG_BICG; it is defined further
+// down in this file and evaluates the callback with both the plain and the
+// conjugate-transposed operator.
+int clbicg(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
+    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Forward declaration of the solver selected by CLCG_BICG_SYM; it is defined further
+// down in this file and only evaluates the callback with the plain operator.
+int clbicg_symmetric(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
+    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Entry point for the un-preconditioned solvers: maps solver_id onto one of the
+// routines of this file and forwards every other argument to it unchanged.
+// Returns CLCG_UNKNOWN_SOLVER when solver_id names no solver handled here,
+// otherwise the status reported by the selected routine.
+int clcg_solver_cuda(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, const cuDoubleComplex* B, 
+    const int n_size, const int nz_size, const clcg_para* param, void* instance, cublasHandle_t cub_handle, 
+    cusparseHandle_t cus_handle, clcg_solver_enum solver_id)
+{
+    cuda_solver_ptr chosen = nullptr;
+
+    if (solver_id == CLCG_BICG)
+    {
+        chosen = clbicg;
+    }
+    else if (solver_id == CLCG_BICG_SYM)
+    {
+        chosen = clbicg_symmetric;
+    }
+
+    // No match above means the identifier is not served by this dispatcher.
+    if (chosen == nullptr)
+    {
+        return CLCG_UNKNOWN_SOLVER;
+    }
+
+    return chosen(Afp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
+}
+
+// Common signature of the preconditioned solvers in this file. Compared with
+// cuda_solver_ptr it takes one extra callback, Mfp, of the same type as Afp.
+// clcg_solver_preconditioned_cuda() dispatches through a pointer of this type.
+typedef int (*cuda_precondtioned_solver_ptr)(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, 
+    cuDoubleComplex* m, const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, 
+    void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Forward declaration of the solver selected by CLCG_PCG; it is defined at the
+// end of this file.
+int clpcg(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
+    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Entry point for the preconditioned solvers. CLCG_PCG is the only identifier
+// accepted; any other value yields CLCG_UNKNOWN_SOLVER. All remaining arguments
+// are handed to the selected routine unchanged and its status is returned.
+int clcg_solver_preconditioned_cuda(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, 
+    cuDoubleComplex* m, const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, clcg_solver_enum solver_id)
+{
+    // Reject everything that is not the preconditioned conjugate gradient id.
+    if (solver_id != CLCG_PCG)
+    {
+        return CLCG_UNKNOWN_SOLVER;
+    }
+
+    cuda_precondtioned_solver_ptr selected = clpcg;
+    return selected(Afp, Mfp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
+}
+
+/**
+ * @brief      Bi-conjugate gradient iteration for the complex linear system Ax = B,
+ * carried out with cuBLAS vector routines and a user-supplied product callback.
+ *
+ * m and B are read with cudaMemcpyHostToDevice into device buffers allocated here,
+ * and the solution is copied back into m with cudaMemcpyDeviceToHost before the
+ * function returns through its clean-up section. Argument-validation failures
+ * return early without touching m.
+ *
+ * @param[in]  Afp         Product callback. It is invoked with
+ * CUSPARSE_OPERATION_NON_TRANSPOSE and with CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE.
+ * Must not be null.
+ * @param[in]  Pfp         Progress callback, may be null. It receives the device
+ * copy of the solution; a non-zero return value stops the iteration.
+ * @param      m           Initial solution on entry, solution on exit (n_size elements).
+ * @param[in]  B           Right-hand side of the linear system (n_size elements).
+ * @param[in]  n_size      Number of elements of m and B. Must be positive.
+ * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
+ * @param[in]  param       Solver parameters; defparam2 is used when null.
+ * @param      instance    User data forwarded unchanged to Afp and Pfp.
+ * @param[in]  cub_handle  cuBLAS handle. Must not be null.
+ * @param[in]  cus_handle  cuSPARSE handle. Must not be null.
+ *
+ * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED,
+ * LCG_REACHED_MAX_ITERATIONS, or one of the CLCG_INVILAD_* / CLCG_INVALID_POINTER
+ * codes when an argument is rejected.
+ */
+int clbicg(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
+    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	// Fall back to the default parameter set when the caller supplies none.
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	// Validate the arguments before any device memory is allocated.
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	// Afp is called unconditionally below, so a null callback is rejected here.
+	if (Afp == nullptr) return CLCG_INVALID_POINTER;
+	if (m == nullptr) return CLCG_INVALID_POINTER;
+	if (B == nullptr) return CLCG_INVALID_POINTER;
+	// Report null handles with the same CLCG_* code family as every other check.
+	if (cub_handle == nullptr) return CLCG_INVALID_POINTER;
+	if (cus_handle == nullptr) return CLCG_INVALID_POINTER;
+
+	const size_t vec_bytes = n_size * sizeof(cuDoubleComplex);
+
+	// NOTE(review): the status codes returned by the CUDA runtime, cuBLAS and
+	// cuSPARSE calls in this routine are not inspected.
+	cuDoubleComplex *d_m = nullptr, *d_B = nullptr;
+	cuDoubleComplex *r1k = nullptr, *r2k = nullptr;
+	cuDoubleComplex *d1k = nullptr, *d2k = nullptr, *Ax = nullptr;
+	cudaMalloc(&d_m, vec_bytes);
+	cudaMalloc(&d_B, vec_bytes);
+	cudaMalloc(&r1k, vec_bytes);
+	cudaMalloc(&r2k, vec_bytes);
+	cudaMalloc(&d1k, vec_bytes);
+	cudaMalloc(&d2k, vec_bytes);
+	cudaMalloc(&Ax, vec_bytes);
+
+	// Copy the initial solution and the right-hand side to the device.
+	cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
+
+	// Dense-vector descriptors for every buffer that is handed to Afp.
+	cusparseDnVecDescr_t dvec_m, dvec_d1k, dvec_d2k, dvec_Ax;
+	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_64F);
+	cusparseCreateDnVec(&dvec_d1k, n_size, d1k, CUDA_C_64F);
+	cusparseCreateDnVec(&dvec_d2k, n_size, d2k, CUDA_C_64F);
+	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_64F);
+
+	cuDoubleComplex one, none;
+	one.x = 1.0; one.y = 0.0;
+	none.x = -1.0; none.y = 0.0;
+	cuDoubleComplex ak, nak, conj_ak, Ad1d2, r1r2_next, betak, conj_betak;
+
+	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+	// r0 = B - Ax
+	cudaMemcpy(r1k, d_B, vec_bytes, cudaMemcpyDeviceToDevice); // r0 = B
+	cublasZaxpy_v2(cub_handle, n_size, &none, Ax, 1, r1k, 1); // r0 -= Ax
+	cudaMemcpy(d1k, r1k, vec_bytes, cudaMemcpyDeviceToDevice); // d0 = r0
+
+	// The second residual/direction pair starts from the conjugate of the first.
+	clcg_vecZ_conjugate(r1k, r2k, n_size);
+	cudaMemcpy(d2k, r2k, vec_bytes, cudaMemcpyDeviceToDevice);
+
+	cuDoubleComplex r1r2;
+	cublasZdotc_v2(cub_handle, n_size, r2k, 1, r1k, 1, &r1r2);
+
+	lcg_float rk_mod;
+	cublasDznrm2_v2(cub_handle, n_size, r1k, 1, &rk_mod);
+
+	// The relative criterion is normalised by the initial residual norm, floored at 1.
+	lcg_float r0_mod = rk_mod;
+	if (r0_mod < 1.0) r0_mod = 1.0;
+
+	int ret, t = 0;
+	lcg_float residual;
+
+	// Judge the starting point with the same criterion the main loop uses. The
+	// relative test must not be consulted when abs_diff is set, otherwise the
+	// solver could stop on a criterion the caller did not ask for.
+	if (para.abs_diff) residual = rk_mod/n_size;
+	else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
+
+	if (residual <= para.epsilon)
+	{
+		// NOTE(review): this and LCG_REACHED_MAX_ITERATIONS below come from the
+		// LCG_* code family while the rest of the function returns CLCG_* codes.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
+		}
+		goto func_ends;
+	}
+
+	while(1)
+	{
+		if (para.abs_diff) residual = rk_mod/n_size;
+		else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
+
+		// Report progress first so the callback also sees the final iterate.
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 disables the iteration limit.
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		// Ax = A * d1
+		Afp(instance, cub_handle, cus_handle, dvec_d1k, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+		// ak = (r2^H r1) / (d2^H A d1)
+		cublasZdotc_v2(cub_handle, n_size, d2k, 1, Ax, 1, &Ad1d2);
+		ak = cuCdiv(r1r2, Ad1d2);
+		nak = cuCmul(none, ak);
+		conj_ak = cuConj(nak); // equals -conj(ak)
+
+		cublasZaxpy_v2(cub_handle, n_size, &ak, d1k, 1, d_m, 1); // m += ak * d1
+		cublasZaxpy_v2(cub_handle, n_size, &nak, Ax, 1, r1k, 1); // r1 -= ak * A d1
+
+		cublasDznrm2_v2(cub_handle, n_size, r1k, 1, &rk_mod);
+
+		// Ax = A^H * d2
+		Afp(instance, cub_handle, cus_handle, dvec_d2k, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE);
+
+		cublasZaxpy_v2(cub_handle, n_size, &conj_ak, Ax, 1, r2k, 1); // r2 -= conj(ak) * A^H d2
+
+		cublasZdotc_v2(cub_handle, n_size, r2k, 1, r1k, 1, &r1r2_next);
+		betak = cuCdiv(r1r2_next, r1r2);
+		conj_betak = cuConj(betak);
+		r1r2 = r1r2_next;
+
+		// d1 = r1 + betak * d1
+		cublasZscal_v2(cub_handle, n_size, &betak, d1k, 1);
+		cublasZaxpy_v2(cub_handle, n_size, &one, r1k, 1, d1k, 1);
+
+		// d2 = r2 + conj(betak) * d2
+		cublasZscal_v2(cub_handle, n_size, &conj_betak, d2k, 1);
+		cublasZaxpy_v2(cub_handle, n_size, &one, r2k, 1, d2k, 1);
+	}
+
+	func_ends:
+	{
+		// Copy the solution back to the host and release every device resource.
+		cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
+
+		cudaFree(d_m);
+		cudaFree(d_B);
+		cudaFree(r1k);
+		cudaFree(r2k);
+		cudaFree(d1k);
+		cudaFree(d2k);
+		cudaFree(Ax);
+		cusparseDestroyDnVec(dvec_m);
+		cusparseDestroyDnVec(dvec_d1k);
+		cusparseDestroyDnVec(dvec_d2k);
+		cusparseDestroyDnVec(dvec_Ax);
+	}
+
+	return ret;
+}
+
+/**
+ * @brief      Conjugate-gradient style iteration for the complex linear system Ax = B
+ * that uses unconjugated dot products (cublasZdotu) and only the plain product A*x.
+ * Presumably intended for complex symmetric matrices, as the name suggests; confirm
+ * against the library documentation.
+ *
+ * m and B are read with cudaMemcpyHostToDevice into device buffers allocated here,
+ * and the solution is copied back into m with cudaMemcpyDeviceToHost before the
+ * function returns through its clean-up section. Argument-validation failures
+ * return early without touching m.
+ *
+ * @param[in]  Afp         Product callback, always invoked with
+ * CUSPARSE_OPERATION_NON_TRANSPOSE. Must not be null.
+ * @param[in]  Pfp         Progress callback, may be null. It receives the device
+ * copy of the solution; a non-zero return value stops the iteration.
+ * @param      m           Initial solution on entry, solution on exit (n_size elements).
+ * @param[in]  B           Right-hand side of the linear system (n_size elements).
+ * @param[in]  n_size      Number of elements of m and B. Must be positive.
+ * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
+ * @param[in]  param       Solver parameters; defparam2 is used when null.
+ * @param      instance    User data forwarded unchanged to Afp and Pfp.
+ * @param[in]  cub_handle  cuBLAS handle. Must not be null.
+ * @param[in]  cus_handle  cuSPARSE handle. Must not be null.
+ *
+ * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED,
+ * LCG_REACHED_MAX_ITERATIONS, or one of the CLCG_INVILAD_* / CLCG_INVALID_POINTER
+ * codes when an argument is rejected.
+ */
+int clbicg_symmetric(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
+    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	// Fall back to the default parameter set when the caller supplies none.
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	// Validate the arguments before any device memory is allocated.
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	// Afp is called unconditionally below, so a null callback is rejected here.
+	if (Afp == nullptr) return CLCG_INVALID_POINTER;
+	if (m == nullptr) return CLCG_INVALID_POINTER;
+	if (B == nullptr) return CLCG_INVALID_POINTER;
+	// Report null handles with the same CLCG_* code family as every other check.
+	if (cub_handle == nullptr) return CLCG_INVALID_POINTER;
+	if (cus_handle == nullptr) return CLCG_INVALID_POINTER;
+
+	const size_t vec_bytes = n_size * sizeof(cuDoubleComplex);
+
+	// NOTE(review): the status codes returned by the CUDA runtime, cuBLAS and
+	// cuSPARSE calls in this routine are not inspected.
+	cuDoubleComplex *d_m = nullptr, *d_B = nullptr;
+	cuDoubleComplex *rk = nullptr, *dk = nullptr, *Ax = nullptr;
+	cudaMalloc(&d_m, vec_bytes);
+	cudaMalloc(&d_B, vec_bytes);
+	cudaMalloc(&rk, vec_bytes);
+	cudaMalloc(&dk, vec_bytes);
+	cudaMalloc(&Ax, vec_bytes);
+
+	// Copy the initial solution and the right-hand side to the device.
+	cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
+
+	// Dense-vector descriptors for every buffer that is handed to Afp.
+	cusparseDnVecDescr_t dvec_m, dvec_dk, dvec_Ax;
+	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_64F);
+	cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_C_64F);
+	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_64F);
+
+	cuDoubleComplex one, none;
+	one.x = 1.0; one.y = 0.0;
+	none.x = -1.0; none.y = 0.0;
+	cuDoubleComplex ak, nak, rkrk2, betak, dkAx;
+
+	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+	// r0 = B - Ax
+	cudaMemcpy(rk, d_B, vec_bytes, cudaMemcpyDeviceToDevice); // r0 = B
+	cublasZaxpy_v2(cub_handle, n_size, &none, Ax, 1, rk, 1); // r0 -= Ax
+	cudaMemcpy(dk, rk, vec_bytes, cudaMemcpyDeviceToDevice); // d0 = r0
+
+	// Unconjugated product r^T r drives the step sizes.
+	cuDoubleComplex rkrk;
+	cublasZdotu_v2(cub_handle, n_size, rk, 1, rk, 1, &rkrk);
+
+	lcg_float rk_mod;
+	cublasDznrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
+
+	// The relative criterion is normalised by the initial residual norm, floored at 1.
+	lcg_float r0_mod = rk_mod;
+	if (r0_mod < 1.0) r0_mod = 1.0;
+
+	int ret, t = 0;
+	lcg_float residual;
+
+	// Judge the starting point with the same criterion the main loop uses. The
+	// relative test must not be consulted when abs_diff is set, otherwise the
+	// solver could stop on a criterion the caller did not ask for.
+	if (para.abs_diff) residual = rk_mod/n_size;
+	else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
+
+	if (residual <= para.epsilon)
+	{
+		// NOTE(review): this and LCG_REACHED_MAX_ITERATIONS below come from the
+		// LCG_* code family while the rest of the function returns CLCG_* codes.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
+		}
+		goto func_ends;
+	}
+
+	while(1)
+	{
+		if (para.abs_diff) residual = rk_mod/n_size;
+		else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
+
+		// Report progress first so the callback also sees the final iterate.
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 disables the iteration limit.
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		// Ax = A * d
+		Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+		// ak = (r^T r) / (d^T A d)
+		cublasZdotu_v2(cub_handle, n_size, dk, 1, Ax, 1, &dkAx);
+		ak = cuCdiv(rkrk, dkAx);
+		nak = cuCmul(none, ak);
+
+		cublasZaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1); // m += ak * d
+		cublasZaxpy_v2(cub_handle, n_size, &nak, Ax, 1, rk, 1); // r -= ak * A d
+
+		cublasDznrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
+
+		cublasZdotu_v2(cub_handle, n_size, rk, 1, rk, 1, &rkrk2);
+		betak = cuCdiv(rkrk2, rkrk);
+		rkrk = rkrk2;
+
+		// d = r + betak * d
+		cublasZscal_v2(cub_handle, n_size, &betak, dk, 1);
+		cublasZaxpy_v2(cub_handle, n_size, &one, rk, 1, dk, 1);
+	}
+
+	func_ends:
+	{
+		// Copy the solution back to the host and release every device resource.
+		cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
+
+		cudaFree(d_m);
+		cudaFree(d_B);
+		cudaFree(rk);
+		cudaFree(dk);
+		cudaFree(Ax);
+		cusparseDestroyDnVec(dvec_m);
+		cusparseDestroyDnVec(dvec_dk);
+		cusparseDestroyDnVec(dvec_Ax);
+	}
+
+	return ret;
+}
+
+/**
+ * @brief      Preconditioned conjugate-gradient style iteration for the complex linear
+ * system Ax = B. The preconditioner callback is applied to the residual before each
+ * direction update, and all inner products are unconjugated (cublasZdotu).
+ *
+ * m and B are read with cudaMemcpyHostToDevice into device buffers allocated here,
+ * and the solution is copied back into m with cudaMemcpyDeviceToHost before the
+ * function returns through its clean-up section. Argument-validation failures
+ * return early without touching m.
+ *
+ * @param[in]  Afp         Product callback, always invoked with
+ * CUSPARSE_OPERATION_NON_TRANSPOSE. Must not be null.
+ * @param[in]  Mfp         Preconditioner callback; it is given the residual and
+ * writes the preconditioned vector. Always invoked with
+ * CUSPARSE_OPERATION_NON_TRANSPOSE. Must not be null.
+ * @param[in]  Pfp         Progress callback, may be null. It receives the device
+ * copy of the solution; a non-zero return value stops the iteration.
+ * @param      m           Initial solution on entry, solution on exit (n_size elements).
+ * @param[in]  B           Right-hand side of the linear system (n_size elements).
+ * @param[in]  n_size      Number of elements of m and B. Must be positive.
+ * @param[in]  nz_size     Forwarded unchanged to Afp, Mfp and Pfp.
+ * @param[in]  param       Solver parameters; defparam2 is used when null.
+ * @param      instance    User data forwarded unchanged to Afp, Mfp and Pfp.
+ * @param[in]  cub_handle  cuBLAS handle. Must not be null.
+ * @param[in]  cus_handle  cuSPARSE handle. Must not be null.
+ *
+ * @return     CLCG_CONVERGENCE, CLCG_STOP, LCG_ALREADY_OPTIMIZIED,
+ * LCG_REACHED_MAX_ITERATIONS, or one of the CLCG_INVILAD_* / CLCG_INVALID_POINTER
+ * codes when an argument is rejected.
+ */
+int clpcg(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, 
+    const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	// Fall back to the default parameter set when the caller supplies none.
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	// Validate the arguments before any device memory is allocated.
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	// Both callbacks are called unconditionally below, so null ones are rejected here.
+	if (Afp == nullptr) return CLCG_INVALID_POINTER;
+	if (Mfp == nullptr) return CLCG_INVALID_POINTER;
+	if (m == nullptr) return CLCG_INVALID_POINTER;
+	if (B == nullptr) return CLCG_INVALID_POINTER;
+	// Report null handles with the same CLCG_* code family as every other check.
+	if (cub_handle == nullptr) return CLCG_INVALID_POINTER;
+	if (cus_handle == nullptr) return CLCG_INVALID_POINTER;
+
+	const size_t vec_bytes = n_size * sizeof(cuDoubleComplex);
+
+	// NOTE(review): the status codes returned by the CUDA runtime, cuBLAS and
+	// cuSPARSE calls in this routine are not inspected.
+	cuDoubleComplex *d_m = nullptr, *d_B = nullptr;
+	cuDoubleComplex *rk = nullptr, *dk = nullptr, *sk = nullptr, *Ax = nullptr;
+	cudaMalloc(&d_m, vec_bytes);
+	cudaMalloc(&d_B, vec_bytes);
+	cudaMalloc(&rk, vec_bytes);
+	cudaMalloc(&dk, vec_bytes);
+	cudaMalloc(&sk, vec_bytes);
+	cudaMalloc(&Ax, vec_bytes);
+
+	// Copy the initial solution and the right-hand side to the device.
+	cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
+
+	// Dense-vector descriptors for every buffer that is handed to Afp or Mfp.
+	cusparseDnVecDescr_t dvec_m, dvec_rk, dvec_dk, dvec_sk, dvec_Ax;
+	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_64F);
+	cusparseCreateDnVec(&dvec_rk, n_size, rk, CUDA_C_64F);
+	cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_C_64F);
+	cusparseCreateDnVec(&dvec_sk, n_size, sk, CUDA_C_64F);
+	cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_64F);
+
+	cuDoubleComplex one, none;
+	one.x = 1.0; one.y = 0.0;
+	none.x = -1.0; none.y = 0.0;
+	cuDoubleComplex ak, nak, d_old, betak, dkAx;
+
+	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+	// r0 = B - Ax
+	cudaMemcpy(rk, d_B, vec_bytes, cudaMemcpyDeviceToDevice); // r0 = B
+	cublasZaxpy_v2(cub_handle, n_size, &none, Ax, 1, rk, 1); // r0 -= Ax
+
+	// d0 = M(r0): the first search direction is the preconditioned residual.
+	Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_dk, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+	cuDoubleComplex d_new;
+	cublasZdotu_v2(cub_handle, n_size, rk, 1, dk, 1, &d_new);
+
+	lcg_float rk_mod;
+	cublasDznrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
+
+	// The relative criterion is normalised by the initial residual norm, floored at 1.
+	lcg_float r0_mod = rk_mod;
+	if (r0_mod < 1.0) r0_mod = 1.0;
+
+	int ret, t = 0;
+	lcg_float residual;
+
+	// Judge the starting point with the same criterion the main loop uses. The
+	// relative test must not be consulted when abs_diff is set, otherwise the
+	// solver could stop on a criterion the caller did not ask for.
+	if (para.abs_diff) residual = rk_mod/n_size;
+	else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
+
+	if (residual <= para.epsilon)
+	{
+		// NOTE(review): this and LCG_REACHED_MAX_ITERATIONS below come from the
+		// LCG_* code family while the rest of the function returns CLCG_* codes.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
+		}
+		goto func_ends;
+	}
+
+	while(1)
+	{
+		if (para.abs_diff) residual = rk_mod/n_size;
+		else residual = rk_mod*rk_mod/(r0_mod*r0_mod);
+
+		// Report progress first so the callback also sees the final iterate.
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 disables the iteration limit.
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		// Ax = A * d
+		Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+		// ak = (r^T M r) / (d^T A d)
+		cublasZdotu_v2(cub_handle, n_size, dk, 1, Ax, 1, &dkAx);
+		ak = cuCdiv(d_new, dkAx);
+		nak = cuCmul(none, ak);
+
+		cublasZaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1); // m += ak * d
+		cublasZaxpy_v2(cub_handle, n_size, &nak, Ax, 1, rk, 1); // r -= ak * A d
+
+		cublasDznrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
+
+		// s = M(r)
+		Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_sk, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+		d_old = d_new;
+		cublasZdotu_v2(cub_handle, n_size, rk, 1, sk, 1, &d_new);
+
+		betak = cuCdiv(d_new, d_old);
+
+		// d = s + betak * d
+		cublasZscal_v2(cub_handle, n_size, &betak, dk, 1);
+		cublasZaxpy_v2(cub_handle, n_size, &one, sk, 1, dk, 1);
+	}
+
+	func_ends:
+	{
+		// Copy the solution back to the host and release every device resource.
+		cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
+
+		cudaFree(d_m);
+		cudaFree(d_B);
+		cudaFree(rk);
+		cudaFree(dk);
+		cudaFree(sk);
+		cudaFree(Ax);
+		cusparseDestroyDnVec(dvec_m);
+		cusparseDestroyDnVec(dvec_rk);
+		cusparseDestroyDnVec(dvec_dk);
+		cusparseDestroyDnVec(dvec_sk);
+		cusparseDestroyDnVec(dvec_Ax);
+	}
+
+	return ret;
+}
--- a/src/lib/clcg_cuda.h
+++ b/src/lib/clcg_cuda.h
@@ -0,0 +1,109 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _CLCG_CUDA_H
+#define _CLCG_CUDA_H
+
+#include "util.h"
+#include "lcg_complex_cuda.h"
+
+#ifdef LibLCG_CUDA
+
+#include <cublas_v2.h>
+#include <cusparse_v2.h>
+
+/**
+ * @brief  Callback interface for calculating the product of a N*N matrix 'A' multiplied 
+ * by a vertical vector 'x'. Note that both A and x are hosted on the GPU device.
+ * 
+ * @param  instance    The user data sent for the clcg_solver_cuda() functions by the client.
+ * @param  cub_handle  Handle of the cublas object.
+ * @param  cus_handle  Handle of the cusparse object.
+ * @param  x           Multiplier of the Ax product.
+ * @param  prod_Ax     Product of A multiplied by x.
+ * @param  n_size      Size of x and column/row numbers of A.
+ * @param  nz_size     Passed through by the solvers unchanged (presumably the number of 
+ *                     non-zero elements of A; confirm with the callback implementation).
+ * @param  oper_t      Operation to apply to A. The solvers in clcg_cuda.cu pass 
+ *                     CUSPARSE_OPERATION_NON_TRANSPOSE or CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE.
+ */
+typedef void (*clcg_axfunc_cuda_ptr)(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, cusparseOperation_t oper_t);
+
+/**
+ * @brief     Callback interface for monitoring the progress and terminating the iteration 
+ * if necessary. Note that m is hosted on the GPU device.
+ * 
+ * @param    instance    The user data sent for the clcg_solver_cuda() functions by the client.
+ * @param    m           The current solutions.
+ * @param    converge    The current value evaluating the iteration progress.
+ * @param    param       The parameter set the solver is running with.
+ * @param    n_size      The size of the variables
+ * @param    nz_size     Passed through by the solvers unchanged (presumably the number of 
+ *                       non-zero elements of A; confirm with the caller).
+ * @param    k           The iteration count.
+ * 
+ * @retval   int         Zero to continue the optimization process. Returning a
+ *                       non-zero value will terminate the optimization process.
+ */
+typedef int (*clcg_progress_cuda_ptr)(void* instance, const cuDoubleComplex* m, const lcg_float converge, 
+	const clcg_para* param, const int n_size, const int nz_size, const int k);
+
+/**
+ * @brief      A combined conjugate gradient solver function. Note that the solvers in 
+ * clcg_cuda.cu read m and B with cudaMemcpyHostToDevice and write the solution back to m 
+ * with cudaMemcpyDeviceToHost, i.e. both are treated as host arrays.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector.
+ * @param      B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  nz_size     Forwarded unchanged to the callbacks (presumably the number of 
+ * non-zero elements of A; confirm with the callback implementation).
+ * @param      param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the clcg_solver_cuda() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      cub_handle  Handle of the cublas object.
+ * @param      cus_handle  Handle of the cusparse object.
+ * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_BICG.
+ *
+ * @return     Status of the function.
+ */
+int clcg_solver_cuda(clcg_axfunc_cuda_ptr Afp, clcg_progress_cuda_ptr Pfp, cuDoubleComplex* m, const cuDoubleComplex* B, 
+    const int n_size, const int nz_size, const clcg_para* param, void* instance, cublasHandle_t cub_handle, 
+    cusparseHandle_t cus_handle, clcg_solver_enum solver_id = CLCG_BICG);
+
+/**
+ * @brief      A combined preconditioned conjugate gradient solver function. Note that the 
+ * solver in clcg_cuda.cu reads m and B with cudaMemcpyHostToDevice and writes the solution 
+ * back to m with cudaMemcpyDeviceToHost, i.e. both are treated as host arrays.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Mfp         Callback function for calculating the product of 'Mx' for preconditioning.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector.
+ * @param      B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  nz_size     Forwarded unchanged to the callbacks (presumably the number of 
+ * non-zero elements of A; confirm with the callback implementation).
+ * @param      param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the clcg_solver_preconditioned_cuda() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      cub_handle  Handle of the cublas object.
+ * @param      cus_handle  Handle of the cusparse object.
+ * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_PCG.
+ *
+ * @return     Status of the function.
+ */
+int clcg_solver_preconditioned_cuda(clcg_axfunc_cuda_ptr Afp, clcg_axfunc_cuda_ptr Mfp, clcg_progress_cuda_ptr Pfp, 
+    cuDoubleComplex* m, const cuDoubleComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, clcg_solver_enum solver_id = CLCG_PCG);
+
+#endif // LibLCG_CUDA
+
+#endif // _CLCG_CUDA_H
--- a/src/lib/clcg_cudaf.cu
+++ b/src/lib/clcg_cudaf.cu
@@ -0,0 +1,529 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "cmath"
+#include "ctime"
+#include "iostream"
+
+#include "clcg_cudaf.h"
+
+
+// Common signature of the non-preconditioned single-precision CUDA solvers in this file.
+// clcg_solver_cuda() stores the selected solver in a pointer of this type.
+using cuda_solver_ptr = int (*)(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m,
+    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance,
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Bi-conjugate gradient solver (defined further down in this file).
+int clbicg(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m,
+    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance,
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Bi-conjugate gradient solver variant that only needs products with A itself
+// (defined further down in this file).
+int clbicg_symmetric(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m,
+    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance,
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+/**
+ * Entry point of the non-preconditioned single-precision CUDA solvers.
+ *
+ * Picks the solver routine that matches solver_id and forwards every argument to it
+ * unchanged. Returns CLCG_UNKNOWN_SOLVER when solver_id is neither CLCG_BICG nor
+ * CLCG_BICG_SYM; otherwise returns whatever the selected solver returns.
+ */
+int clcg_solver_cuda(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, const cuComplex* B, 
+    const int n_size, const int nz_size, const clcg_para* param, void* instance, cublasHandle_t cub_handle, 
+    cusparseHandle_t cus_handle, clcg_solver_enum solver_id)
+{
+    cuda_solver_ptr chosen = nullptr;
+
+    if (solver_id == CLCG_BICG)
+    {
+        chosen = clbicg;
+    }
+    else if (solver_id == CLCG_BICG_SYM)
+    {
+        chosen = clbicg_symmetric;
+    }
+
+    // No routine matched the requested identifier.
+    if (chosen == nullptr)
+    {
+        return CLCG_UNKNOWN_SOLVER;
+    }
+
+    return chosen(Afp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
+}
+
+// Common signature of the preconditioned single-precision CUDA solvers in this file.
+// clcg_solver_preconditioned_cuda() stores the selected solver in a pointer of this type.
+using cuda_precondtioned_solver_ptr = int (*)(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp,
+    clcg_progress_cudaf_ptr Pfp, cuComplex* m, const cuComplex* B, const int n_size, const int nz_size,
+    const clcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Preconditioned conjugate gradient solver (defined further down in this file).
+int clpcg(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp, clcg_progress_cudaf_ptr Pfp, cuComplex* m,
+    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance,
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+/**
+ * Entry point of the preconditioned single-precision CUDA solvers.
+ *
+ * Only CLCG_PCG is available here; any other solver_id yields CLCG_UNKNOWN_SOLVER.
+ * All arguments are forwarded to the selected solver unchanged and its return
+ * value is passed back to the caller.
+ */
+int clcg_solver_preconditioned_cuda(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp, clcg_progress_cudaf_ptr Pfp, 
+    cuComplex* m, const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, clcg_solver_enum solver_id)
+{
+    cuda_precondtioned_solver_ptr chosen = nullptr;
+
+    if (solver_id == CLCG_PCG)
+    {
+        chosen = clpcg;
+    }
+
+    // No routine matched the requested identifier.
+    if (chosen == nullptr)
+    {
+        return CLCG_UNKNOWN_SOLVER;
+    }
+
+    return chosen(Afp, Mfp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
+}
+
+/**
+ * @brief      Bi-conjugate gradient solver for complex linear systems (single precision, CUDA).
+ *
+ * The host vectors m and B are copied into device buffers, the iteration is carried out
+ * with cuBLAS calls and the Afp callback, and the final solution is copied back into m.
+ *
+ * The monitored residual is ||r||/n_size when para.abs_diff is set, and
+ * ||r||^2 / max(||r0||, 1)^2 otherwise, where r = B - A*m.
+ *
+ * @param[in]     Afp         Callback computing the matrix-vector product. It is called with
+ *                            CUSPARSE_OPERATION_NON_TRANSPOSE and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE.
+ * @param[in]     Pfp         Optional progress callback (may be nullptr). It receives the device
+ *                            copy of the solution; a non-zero return stops the iteration.
+ * @param[in,out] m           Host pointer: initial solution on entry, final solution on return.
+ * @param[in]     B           Host pointer to the right-hand side.
+ * @param[in]     n_size      Length of m and B.
+ * @param[in]     nz_size     Passed through to the callbacks unchanged.
+ * @param[in]     param       Solver parameters; defparam2 is used when nullptr.
+ * @param[in]     instance    User data passed through to the callbacks.
+ * @param[in]     cub_handle  cuBLAS handle.
+ * @param[in]     cus_handle  cuSPARSE handle.
+ *
+ * @return     One of the status codes: CLCG_INVILAD_VARIABLE_SIZE, CLCG_INVILAD_MAX_ITERATIONS,
+ *             CLCG_INVILAD_EPSILON, CLCG_INVALID_POINTER, LCG_ALREADY_OPTIMIZIED, CLCG_STOP,
+ *             CLCG_CONVERGENCE or LCG_REACHED_MAX_ITERATIONS.
+ */
+int clbicg(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
+    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+    // Fall back to the library defaults when no parameter set is supplied.
+    clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+    // Validate the inputs before any device memory is touched.
+    if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+    if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+    if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+    // The product callback is mandatory; the progress callback is optional.
+    if (Afp == nullptr) return CLCG_INVALID_POINTER;
+    if (m == nullptr || B == nullptr) return CLCG_INVALID_POINTER;
+    if (cub_handle == nullptr || cus_handle == nullptr) return CLCG_INVALID_POINTER;
+
+    const size_t vec_bytes = n_size * sizeof(cuComplex);
+
+    // NOTE(review): return codes of the CUDA/cuBLAS/cuSPARSE calls below are not checked;
+    // no dedicated status code for allocation failures is visible from this file.
+    cuComplex *d_m = nullptr, *d_B = nullptr;
+    cuComplex *r1k = nullptr, *r2k = nullptr;
+    cuComplex *d1k = nullptr, *d2k = nullptr, *Ax = nullptr;
+    cudaMalloc(&d_m, vec_bytes);
+    cudaMalloc(&d_B, vec_bytes);
+    cudaMalloc(&r1k, vec_bytes);
+    cudaMalloc(&r2k, vec_bytes);
+    cudaMalloc(&d1k, vec_bytes);
+    cudaMalloc(&d2k, vec_bytes);
+    cudaMalloc(&Ax, vec_bytes);
+
+    // Upload the initial solution and the right-hand side.
+    cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
+
+    // Dense-vector descriptors for the buffers that are handed to the Afp callback.
+    cusparseDnVecDescr_t dvec_m, dvec_d1k, dvec_d2k, dvec_Ax;
+    cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_32F);
+    cusparseCreateDnVec(&dvec_d1k, n_size, d1k, CUDA_C_32F);
+    cusparseCreateDnVec(&dvec_d2k, n_size, d2k, CUDA_C_32F);
+    cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_32F);
+
+    cuComplex one, none;
+    one.x = 1.0f; one.y = 0.0f;
+    none.x = -1.0f; none.y = 0.0f;
+    cuComplex ak, nak, conj_ak, Ad1d2, r1r2_next, betak, conj_betak;
+
+    // Ax = A * m
+    Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+    // r1k = B - A*m, d1k = r1k
+    cudaMemcpy(r1k, d_B, vec_bytes, cudaMemcpyDeviceToDevice);
+    cublasCaxpy_v2(cub_handle, n_size, &none, Ax, 1, r1k, 1);
+    cudaMemcpy(d1k, r1k, vec_bytes, cudaMemcpyDeviceToDevice);
+
+    // Shadow residual and direction; presumably r2k = conj(r1k) (helper defined elsewhere).
+    clcg_vecC_conjugate(r1k, r2k, n_size);
+    cudaMemcpy(d2k, r2k, vec_bytes, cudaMemcpyDeviceToDevice);
+
+    cuComplex r1r2;
+    cublasCdotc_v2(cub_handle, n_size, r2k, 1, r1k, 1, &r1r2);
+
+    float rk_mod;
+    cublasScnrm2_v2(cub_handle, n_size, r1k, 1, &rk_mod);
+
+    // Reference norm for the relative criterion, clamped from below to 1.
+    float r0_mod = rk_mod;
+    if (r0_mod < 1.0f) r0_mod = 1.0f;
+
+    int ret;
+    int t = 0;
+    float residual = para.abs_diff ? rk_mod/n_size : rk_mod*rk_mod/(r0_mod*r0_mod);
+
+    // NOTE(review): LCG_ALREADY_OPTIMIZIED / LCG_REACHED_MAX_ITERATIONS come from the real-valued
+    // solver family; CLCG_ equivalents are presumably intended - confirm against the enum.
+    if (residual <= para.epsilon)
+    {
+        // Only the criterion selected by para.abs_diff decides whether the start point is accepted.
+        ret = LCG_ALREADY_OPTIMIZIED;
+        if (Pfp != nullptr)
+        {
+            Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
+        }
+    }
+    else
+    {
+        while (true)
+        {
+            if (Pfp != nullptr && Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
+            {
+                ret = CLCG_STOP;
+                break;
+            }
+
+            if (residual <= para.epsilon)
+            {
+                ret = CLCG_CONVERGENCE;
+                break;
+            }
+
+            // max_iterations == 0 means "no limit".
+            if (para.max_iterations > 0 && t+1 > para.max_iterations)
+            {
+                ret = LCG_REACHED_MAX_ITERATIONS;
+                break;
+            }
+
+            t++;
+
+            // Ax = A * d1k
+            Afp(instance, cub_handle, cus_handle, dvec_d1k, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+            // ak = (r2k^H r1k) / (d2k^H A d1k)
+            cublasCdotc_v2(cub_handle, n_size, d2k, 1, Ax, 1, &Ad1d2);
+            ak = cuCdivf(r1r2, Ad1d2);
+            nak = cuCmulf(none, ak);
+            conj_ak = cuConjf(nak); // conj(-ak), used for the shadow residual update
+
+            cublasCaxpy_v2(cub_handle, n_size, &ak, d1k, 1, d_m, 1);  // m   += ak * d1k
+            cublasCaxpy_v2(cub_handle, n_size, &nak, Ax, 1, r1k, 1);  // r1k -= ak * A*d1k
+
+            cublasScnrm2_v2(cub_handle, n_size, r1k, 1, &rk_mod);
+
+            // Ax = A^H * d2k
+            Afp(instance, cub_handle, cus_handle, dvec_d2k, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE);
+
+            cublasCaxpy_v2(cub_handle, n_size, &conj_ak, Ax, 1, r2k, 1); // r2k -= conj(ak) * A^H*d2k
+
+            cublasCdotc_v2(cub_handle, n_size, r2k, 1, r1k, 1, &r1r2_next);
+            betak = cuCdivf(r1r2_next, r1r2);
+            conj_betak = cuConjf(betak);
+            r1r2 = r1r2_next;
+
+            // d1k = r1k + betak * d1k
+            cublasCscal_v2(cub_handle, n_size, &betak, d1k, 1);
+            cublasCaxpy_v2(cub_handle, n_size, &one, r1k, 1, d1k, 1);
+
+            // d2k = r2k + conj(betak) * d2k
+            cublasCscal_v2(cub_handle, n_size, &conj_betak, d2k, 1);
+            cublasCaxpy_v2(cub_handle, n_size, &one, r2k, 1, d2k, 1);
+
+            residual = para.abs_diff ? rk_mod/n_size : rk_mod*rk_mod/(r0_mod*r0_mod);
+        }
+    }
+
+    // Download the solution and release every device resource, whatever the exit reason.
+    cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
+
+    cudaFree(d_m);
+    cudaFree(d_B);
+    cudaFree(r1k);
+    cudaFree(r2k);
+    cudaFree(d1k);
+    cudaFree(d2k);
+    cudaFree(Ax);
+    cusparseDestroyDnVec(dvec_m);
+    cusparseDestroyDnVec(dvec_d1k);
+    cusparseDestroyDnVec(dvec_d2k);
+    cusparseDestroyDnVec(dvec_Ax);
+
+    return ret;
+}
+
+/**
+ * @brief      Bi-conjugate gradient solver variant that uses a single residual/direction pair
+ *             and unconjugated dot products (single precision, CUDA).
+ *
+ * The host vectors m and B are copied into device buffers, the iteration is carried out
+ * with cuBLAS calls and the Afp callback, and the final solution is copied back into m.
+ * Afp is only ever called with CUSPARSE_OPERATION_NON_TRANSPOSE.
+ *
+ * The monitored residual is ||r||/n_size when para.abs_diff is set, and
+ * ||r||^2 / max(||r0||, 1)^2 otherwise, where r = B - A*m.
+ *
+ * @param[in]     Afp         Callback computing the matrix-vector product.
+ * @param[in]     Pfp         Optional progress callback (may be nullptr). It receives the device
+ *                            copy of the solution; a non-zero return stops the iteration.
+ * @param[in,out] m           Host pointer: initial solution on entry, final solution on return.
+ * @param[in]     B           Host pointer to the right-hand side.
+ * @param[in]     n_size      Length of m and B.
+ * @param[in]     nz_size     Passed through to the callbacks unchanged.
+ * @param[in]     param       Solver parameters; defparam2 is used when nullptr.
+ * @param[in]     instance    User data passed through to the callbacks.
+ * @param[in]     cub_handle  cuBLAS handle.
+ * @param[in]     cus_handle  cuSPARSE handle.
+ *
+ * @return     One of the status codes: CLCG_INVILAD_VARIABLE_SIZE, CLCG_INVILAD_MAX_ITERATIONS,
+ *             CLCG_INVILAD_EPSILON, CLCG_INVALID_POINTER, LCG_ALREADY_OPTIMIZIED, CLCG_STOP,
+ *             CLCG_CONVERGENCE or LCG_REACHED_MAX_ITERATIONS.
+ */
+int clbicg_symmetric(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
+    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+    // Fall back to the library defaults when no parameter set is supplied.
+    clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+    // Validate the inputs before any device memory is touched.
+    if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+    if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+    if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+    // The product callback is mandatory; the progress callback is optional.
+    if (Afp == nullptr) return CLCG_INVALID_POINTER;
+    if (m == nullptr || B == nullptr) return CLCG_INVALID_POINTER;
+    if (cub_handle == nullptr || cus_handle == nullptr) return CLCG_INVALID_POINTER;
+
+    // Every buffer holds n_size single-precision complex values.
+    const size_t vec_bytes = n_size * sizeof(cuComplex);
+
+    // NOTE(review): return codes of the CUDA/cuBLAS/cuSPARSE calls below are not checked;
+    // no dedicated status code for allocation failures is visible from this file.
+    cuComplex *d_m = nullptr, *d_B = nullptr;
+    cuComplex *rk = nullptr, *dk = nullptr, *Ax = nullptr;
+    cudaMalloc(&d_m, vec_bytes);
+    cudaMalloc(&d_B, vec_bytes);
+    cudaMalloc(&rk, vec_bytes);
+    cudaMalloc(&dk, vec_bytes);
+    cudaMalloc(&Ax, vec_bytes);
+
+    // Upload the initial solution and the right-hand side.
+    cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
+
+    // Dense-vector descriptors for the buffers that are handed to the Afp callback.
+    cusparseDnVecDescr_t dvec_m, dvec_dk, dvec_Ax;
+    cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_32F);
+    cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_C_32F);
+    cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_32F);
+
+    cuComplex one, none;
+    one.x = 1.0f; one.y = 0.0f;
+    none.x = -1.0f; none.y = 0.0f;
+    cuComplex ak, nak, rkrk2, betak, dkAx;
+
+    // Ax = A * m
+    Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+    // rk = B - A*m, dk = rk
+    cudaMemcpy(rk, d_B, vec_bytes, cudaMemcpyDeviceToDevice);
+    cublasCaxpy_v2(cub_handle, n_size, &none, Ax, 1, rk, 1);
+    cudaMemcpy(dk, rk, vec_bytes, cudaMemcpyDeviceToDevice);
+
+    // Unconjugated product rk^T rk drives the step lengths.
+    cuComplex rkrk;
+    cublasCdotu_v2(cub_handle, n_size, rk, 1, rk, 1, &rkrk);
+
+    float rk_mod;
+    cublasScnrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
+
+    // Reference norm for the relative criterion, clamped from below to 1.
+    float r0_mod = rk_mod;
+    if (r0_mod < 1.0f) r0_mod = 1.0f;
+
+    int ret;
+    int t = 0;
+    float residual = para.abs_diff ? rk_mod/n_size : rk_mod*rk_mod/(r0_mod*r0_mod);
+
+    // NOTE(review): LCG_ALREADY_OPTIMIZIED / LCG_REACHED_MAX_ITERATIONS come from the real-valued
+    // solver family; CLCG_ equivalents are presumably intended - confirm against the enum.
+    if (residual <= para.epsilon)
+    {
+        // Only the criterion selected by para.abs_diff decides whether the start point is accepted.
+        ret = LCG_ALREADY_OPTIMIZIED;
+        if (Pfp != nullptr)
+        {
+            Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
+        }
+    }
+    else
+    {
+        while (true)
+        {
+            if (Pfp != nullptr && Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
+            {
+                ret = CLCG_STOP;
+                break;
+            }
+
+            if (residual <= para.epsilon)
+            {
+                ret = CLCG_CONVERGENCE;
+                break;
+            }
+
+            // max_iterations == 0 means "no limit".
+            if (para.max_iterations > 0 && t+1 > para.max_iterations)
+            {
+                ret = LCG_REACHED_MAX_ITERATIONS;
+                break;
+            }
+
+            t++;
+
+            // Ax = A * dk
+            Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+            // ak = (rk^T rk) / (dk^T A dk)
+            cublasCdotu_v2(cub_handle, n_size, dk, 1, Ax, 1, &dkAx);
+            ak = cuCdivf(rkrk, dkAx);
+            nak = cuCmulf(none, ak);
+
+            cublasCaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1);  // m  += ak * dk
+            cublasCaxpy_v2(cub_handle, n_size, &nak, Ax, 1, rk, 1);  // rk -= ak * A*dk
+
+            cublasScnrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
+
+            cublasCdotu_v2(cub_handle, n_size, rk, 1, rk, 1, &rkrk2);
+            betak = cuCdivf(rkrk2, rkrk);
+            rkrk = rkrk2;
+
+            // dk = rk + betak * dk
+            cublasCscal_v2(cub_handle, n_size, &betak, dk, 1);
+            cublasCaxpy_v2(cub_handle, n_size, &one, rk, 1, dk, 1);
+
+            residual = para.abs_diff ? rk_mod/n_size : rk_mod*rk_mod/(r0_mod*r0_mod);
+        }
+    }
+
+    // Download the solution and release every device resource, whatever the exit reason.
+    cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
+
+    cudaFree(d_m);
+    cudaFree(d_B);
+    cudaFree(rk);
+    cudaFree(dk);
+    cudaFree(Ax);
+    cusparseDestroyDnVec(dvec_m);
+    cusparseDestroyDnVec(dvec_dk);
+    cusparseDestroyDnVec(dvec_Ax);
+
+    return ret;
+}
+
+/**
+ * @brief      Preconditioned conjugate gradient solver for complex linear systems
+ *             (single precision, CUDA).
+ *
+ * The host vectors m and B are copied into device buffers, the iteration is carried out
+ * with cuBLAS calls and the Afp/Mfp callbacks, and the final solution is copied back into m.
+ * Both callbacks are only ever called with CUSPARSE_OPERATION_NON_TRANSPOSE.
+ *
+ * The monitored residual is ||r||/n_size when para.abs_diff is set, and
+ * ||r||^2 / max(||r0||, 1)^2 otherwise, where r = B - A*m.
+ *
+ * @param[in]     Afp         Callback computing the matrix-vector product.
+ * @param[in]     Mfp         Callback applying the preconditioner to a vector.
+ * @param[in]     Pfp         Optional progress callback (may be nullptr). It receives the device
+ *                            copy of the solution; a non-zero return stops the iteration.
+ * @param[in,out] m           Host pointer: initial solution on entry, final solution on return.
+ * @param[in]     B           Host pointer to the right-hand side.
+ * @param[in]     n_size      Length of m and B.
+ * @param[in]     nz_size     Passed through to the callbacks unchanged.
+ * @param[in]     param       Solver parameters; defparam2 is used when nullptr.
+ * @param[in]     instance    User data passed through to the callbacks.
+ * @param[in]     cub_handle  cuBLAS handle.
+ * @param[in]     cus_handle  cuSPARSE handle.
+ *
+ * @return     One of the status codes: CLCG_INVILAD_VARIABLE_SIZE, CLCG_INVILAD_MAX_ITERATIONS,
+ *             CLCG_INVILAD_EPSILON, CLCG_INVALID_POINTER, LCG_ALREADY_OPTIMIZIED, CLCG_STOP,
+ *             CLCG_CONVERGENCE or LCG_REACHED_MAX_ITERATIONS.
+ */
+int clpcg(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, 
+    const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+    // Fall back to the library defaults when no parameter set is supplied.
+    clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+    // Validate the inputs before any device memory is touched.
+    if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+    if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+    if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+    // Both product callbacks are mandatory; the progress callback is optional.
+    if (Afp == nullptr || Mfp == nullptr) return CLCG_INVALID_POINTER;
+    if (m == nullptr || B == nullptr) return CLCG_INVALID_POINTER;
+    if (cub_handle == nullptr || cus_handle == nullptr) return CLCG_INVALID_POINTER;
+
+    const size_t vec_bytes = n_size * sizeof(cuComplex);
+
+    // NOTE(review): return codes of the CUDA/cuBLAS/cuSPARSE calls below are not checked;
+    // no dedicated status code for allocation failures is visible from this file.
+    cuComplex *d_m = nullptr, *d_B = nullptr;
+    cuComplex *rk = nullptr, *dk = nullptr, *sk = nullptr, *Ax = nullptr;
+    cudaMalloc(&d_m, vec_bytes);
+    cudaMalloc(&d_B, vec_bytes);
+    cudaMalloc(&rk, vec_bytes);
+    cudaMalloc(&dk, vec_bytes);
+    cudaMalloc(&sk, vec_bytes);
+    cudaMalloc(&Ax, vec_bytes);
+
+    // Upload the initial solution and the right-hand side.
+    cudaMemcpy(d_m, m, vec_bytes, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, B, vec_bytes, cudaMemcpyHostToDevice);
+
+    // Dense-vector descriptors for the buffers that are handed to the callbacks.
+    cusparseDnVecDescr_t dvec_m, dvec_rk, dvec_dk, dvec_sk, dvec_Ax;
+    cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_C_32F);
+    cusparseCreateDnVec(&dvec_rk, n_size, rk, CUDA_C_32F);
+    cusparseCreateDnVec(&dvec_dk, n_size, dk, CUDA_C_32F);
+    cusparseCreateDnVec(&dvec_sk, n_size, sk, CUDA_C_32F);
+    cusparseCreateDnVec(&dvec_Ax, n_size, Ax, CUDA_C_32F);
+
+    cuComplex one, none;
+    one.x = 1.0f; one.y = 0.0f;
+    none.x = -1.0f; none.y = 0.0f;
+    cuComplex ak, nak, d_old, betak, dkAx;
+
+    // Ax = A * m
+    Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+    // rk = B - A*m
+    cudaMemcpy(rk, d_B, vec_bytes, cudaMemcpyDeviceToDevice);
+    cublasCaxpy_v2(cub_handle, n_size, &none, Ax, 1, rk, 1);
+
+    // dk = M * rk (preconditioned residual is the first search direction)
+    Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_dk, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+    // d_new = rk^T dk (unconjugated)
+    cuComplex d_new;
+    cublasCdotu_v2(cub_handle, n_size, rk, 1, dk, 1, &d_new);
+
+    float rk_mod;
+    cublasScnrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
+
+    // Reference norm for the relative criterion, clamped from below to 1.
+    float r0_mod = rk_mod;
+    if (r0_mod < 1.0f) r0_mod = 1.0f;
+
+    int ret;
+    int t = 0;
+    float residual = para.abs_diff ? rk_mod/n_size : rk_mod*rk_mod/(r0_mod*r0_mod);
+
+    // NOTE(review): LCG_ALREADY_OPTIMIZIED / LCG_REACHED_MAX_ITERATIONS come from the real-valued
+    // solver family; CLCG_ equivalents are presumably intended - confirm against the enum.
+    if (residual <= para.epsilon)
+    {
+        // Only the criterion selected by para.abs_diff decides whether the start point is accepted.
+        ret = LCG_ALREADY_OPTIMIZIED;
+        if (Pfp != nullptr)
+        {
+            Pfp(instance, d_m, residual, &para, n_size, nz_size, 0);
+        }
+    }
+    else
+    {
+        while (true)
+        {
+            if (Pfp != nullptr && Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
+            {
+                ret = CLCG_STOP;
+                break;
+            }
+
+            if (residual <= para.epsilon)
+            {
+                ret = CLCG_CONVERGENCE;
+                break;
+            }
+
+            // max_iterations == 0 means "no limit".
+            if (para.max_iterations > 0 && t+1 > para.max_iterations)
+            {
+                ret = LCG_REACHED_MAX_ITERATIONS;
+                break;
+            }
+
+            t++;
+
+            // Ax = A * dk
+            Afp(instance, cub_handle, cus_handle, dvec_dk, dvec_Ax, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+            // ak = (rk^T M rk) / (dk^T A dk)
+            cublasCdotu_v2(cub_handle, n_size, dk, 1, Ax, 1, &dkAx);
+            ak = cuCdivf(d_new, dkAx);
+            nak = cuCmulf(none, ak);
+
+            cublasCaxpy_v2(cub_handle, n_size, &ak, dk, 1, d_m, 1);  // m  += ak * dk
+            cublasCaxpy_v2(cub_handle, n_size, &nak, Ax, 1, rk, 1);  // rk -= ak * A*dk
+
+            cublasScnrm2_v2(cub_handle, n_size, rk, 1, &rk_mod);
+
+            // sk = M * rk
+            Mfp(instance, cub_handle, cus_handle, dvec_rk, dvec_sk, n_size, nz_size, CUSPARSE_OPERATION_NON_TRANSPOSE);
+
+            d_old = d_new;
+            cublasCdotu_v2(cub_handle, n_size, rk, 1, sk, 1, &d_new);
+
+            betak = cuCdivf(d_new, d_old);
+
+            // dk = sk + betak * dk
+            cublasCscal_v2(cub_handle, n_size, &betak, dk, 1);
+            cublasCaxpy_v2(cub_handle, n_size, &one, sk, 1, dk, 1);
+
+            residual = para.abs_diff ? rk_mod/n_size : rk_mod*rk_mod/(r0_mod*r0_mod);
+        }
+    }
+
+    // Download the solution and release every device resource, whatever the exit reason.
+    cudaMemcpy(m, d_m, vec_bytes, cudaMemcpyDeviceToHost);
+
+    cudaFree(d_m);
+    cudaFree(d_B);
+    cudaFree(rk);
+    cudaFree(dk);
+    cudaFree(sk);
+    cudaFree(Ax);
+    cusparseDestroyDnVec(dvec_m);
+    cusparseDestroyDnVec(dvec_rk);
+    cusparseDestroyDnVec(dvec_dk);
+    cusparseDestroyDnVec(dvec_sk);
+    cusparseDestroyDnVec(dvec_Ax);
+
+    return ret;
+}
--- a/src/lib/clcg_cudaf.h
+++ b/src/lib/clcg_cudaf.h
@@ -0,0 +1,109 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _CLCG_CUDA_FLOAT_H
+#define _CLCG_CUDA_FLOAT_H
+
+#include "util.h"
+#include "lcg_complex_cuda.h"
+
+#ifdef LibLCG_CUDA
+
+#include <cublas_v2.h>
+#include <cusparse_v2.h>
+
+/**
+ * @brief  Callback interface for calculating the product of a N*N matrix 'A' multiplied 
+ * by a vertical vector 'x'. Note that both A and x are hosted on the GPU device.
+ * 
+ * @param  instance    The user data sent for the clcg_solver_cuda() functions by the client.
+ * @param  cub_handle  Handle of the cublas object.
+ * @param  cus_handle  Handle of the cusparse object.
+ * @param  x           cuSPARSE dense-vector descriptor of the multiplier of the Ax product.
+ * @param  prod_Ax     cuSPARSE dense-vector descriptor receiving the product of A multiplied by x.
+ * @param  n_size      Size of x and column/row numbers of A.
+ * @param  nz_size     The nz_size value the solver was called with, passed through unchanged;
+ *                     presumably the number of non-zero entries of A (TODO confirm).
+ * @param  oper_t      Operation to apply to A. The solvers in clcg_cudaf.cu pass
+ *                     CUSPARSE_OPERATION_NON_TRANSPOSE or CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE.
+ */
+typedef void (*clcg_axfunc_cudaf_ptr)(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, cusparseOperation_t oper_t);
+
+/**
+ * @brief     Callback interface for monitoring the progress and terminate the iteration 
+ * if necessary. Note that m is hosted on the GPU device.
+ * 
+ * @param    instance    The user data sent for the clcg_solver_cuda() functions by the client.
+ * @param    m           The current solutions (device pointer).
+ * @param    converge    The current value evaluating the iteration progress.
+ * @param    param       The parameter set the solver is running with.
+ * @param    n_size      The size of the variables
+ * @param    nz_size     The nz_size value the solver was called with, passed through unchanged.
+ * @param    k           The iteration count.
+ * 
+ * @retval   int         Zero to continue the optimization process. Returning a
+ *                       non-zero value will terminate the optimization process.
+ */
+typedef int (*clcg_progress_cudaf_ptr)(void* instance, const cuComplex* m, const float converge, 
+	const clcg_para* param, const int n_size, const int nz_size, const int k);
+
+/**
+ * @brief      A combined conjugate gradient solver function (single precision). Note that m and B
+ * are host pointers: the solvers in clcg_cudaf.cu copy them to the device before iterating and
+ * copy the solution back into m before returning.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress. May be nullptr.
+ * @param      m           Initial solution vector on entry; the solution on return.
+ * @param      B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  nz_size     Passed through to the callbacks unchanged; presumably the number of
+ *                         non-zero entries of the sparse matrix (TODO confirm).
+ * @param      param       Parameter setup for the conjugate gradient methods. The library
+ *                         defaults are used when this is nullptr.
+ * @param      instance    The user data sent for the clcg_solver_cuda() function by the client.
+ *                         This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      cub_handle  Handle of the cublas object.
+ * @param      cus_handle  Handle of the cusparse object.
+ * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_BICG.
+ *                         CLCG_BICG and CLCG_BICG_SYM are accepted; other values return CLCG_UNKNOWN_SOLVER.
+ *
+ * @return     Status of the function.
+ */
+int clcg_solver_cuda(clcg_axfunc_cudaf_ptr Afp, clcg_progress_cudaf_ptr Pfp, cuComplex* m, const cuComplex* B, 
+    const int n_size, const int nz_size, const clcg_para* param, void* instance, cublasHandle_t cub_handle, 
+    cusparseHandle_t cus_handle, clcg_solver_enum solver_id = CLCG_BICG);
+
+/**
+ * @brief      A combined preconditioned conjugate gradient solver function (single precision).
+ * Note that m and B are host pointers: the solver in clcg_cudaf.cu copies them to the device
+ * before iterating and copies the solution back into m before returning.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Mfp         Callback function for calculating the product of 'Mx' for preconditioning.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress. May be nullptr.
+ * @param      m           Initial solution vector on entry; the solution on return.
+ * @param      B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  nz_size     Passed through to the callbacks unchanged; presumably the number of
+ *                         non-zero entries of the sparse matrix (TODO confirm).
+ * @param      param       Parameter setup for the conjugate gradient methods. The library
+ *                         defaults are used when this is nullptr.
+ * @param      instance    The user data sent for the clcg_solver_preconditioned_cuda() function by the client.
+ *                         This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      cub_handle  Handle of the cublas object.
+ * @param      cus_handle  Handle of the cusparse object.
+ * @param      solver_id   Solver type used to solve the linear system. The default value is CLCG_PCG.
+ *                         Only CLCG_PCG is accepted; other values return CLCG_UNKNOWN_SOLVER.
+ *
+ * @return     Status of the function.
+ */
+int clcg_solver_preconditioned_cuda(clcg_axfunc_cudaf_ptr Afp, clcg_axfunc_cudaf_ptr Mfp, clcg_progress_cudaf_ptr Pfp, 
+    cuComplex* m, const cuComplex* B, const int n_size, const int nz_size, const clcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, clcg_solver_enum solver_id = CLCG_PCG);
+
+#endif // LibLCG_CUDA
+
+#endif // _CLCG_CUDA_FLOAT_H
--- a/src/lib/clcg_eigen.cpp
+++ b/src/lib/clcg_eigen.cpp
@@ -0,0 +1,777 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "cmath"
+#include "ctime"
+#include "iostream"
+
+#include "clcg_eigen.h"
+
+#include "config.h"
+#ifdef LibLCG_OPENMP
+#include "omp.h"
+#endif
+
+
+// Common signature of the non-preconditioned Eigen solvers in this file.
+// clcg_solver_eigen() stores the selected solver in a pointer of this type.
+using eigen_solver_ptr = int (*)(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp,
+	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
+
+// Solver selected by CLCG_BICG.
+int clbicg(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m,
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
+
+// Solver selected by CLCG_BICG_SYM.
+int clbicg_symmetric(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m,
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
+
+// Solver selected by CLCG_CGS.
+int clcgs(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m,
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
+
+// Solver selected by CLCG_TFQMR.
+int cltfqmr(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m,
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
+
+/**
+ * Entry point of the non-preconditioned Eigen solvers.
+ *
+ * Maps solver_id (CLCG_BICG, CLCG_BICG_SYM, CLCG_CGS or CLCG_TFQMR) to the matching
+ * routine and forwards every argument to it unchanged. Any other identifier yields
+ * CLCG_UNKNOWN_SOLVER.
+ */
+int clcg_solver_eigen(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance, clcg_solver_enum solver_id)
+{
+	eigen_solver_ptr chosen = nullptr;
+
+	if (solver_id == CLCG_BICG)
+	{
+		chosen = clbicg;
+	}
+	else if (solver_id == CLCG_BICG_SYM)
+	{
+		chosen = clbicg_symmetric;
+	}
+	else if (solver_id == CLCG_CGS)
+	{
+		chosen = clcgs;
+	}
+	else if (solver_id == CLCG_TFQMR)
+	{
+		chosen = cltfqmr;
+	}
+
+	// No routine matched the requested identifier.
+	if (chosen == nullptr)
+	{
+		return CLCG_UNKNOWN_SOLVER;
+	}
+
+	return chosen(Afp, Pfp, m, B, param, instance);
+}
+
+
+// Common signature of the preconditioned Eigen solvers in this file.
+// clcg_solver_preconditioned_eigen() stores the selected solver in a pointer of this type.
+using eigen_preconditioned_solver_ptr = int (*)(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp,
+	clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, const Eigen::VectorXcd &B,
+	const clcg_para* param, void* instance);
+
+// Solver selected by CLCG_PCG.
+int clpcg(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp,
+	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
+
+// Solver selected by CLCG_PBICG.
+int clpbicg(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp,
+	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance);
+
+/**
+ * Entry point of the preconditioned Eigen solvers.
+ *
+ * Maps solver_id (CLCG_PCG or CLCG_PBICG) to the matching routine and forwards every
+ * argument to it unchanged. Any other identifier yields CLCG_UNKNOWN_SOLVER.
+ */
+int clcg_solver_preconditioned_eigen(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
+	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance, clcg_solver_enum solver_id)
+{
+	eigen_preconditioned_solver_ptr chosen = nullptr;
+
+	if (solver_id == CLCG_PCG)
+	{
+		chosen = clpcg;
+	}
+	else if (solver_id == CLCG_PBICG)
+	{
+		chosen = clpbicg;
+	}
+
+	// No routine matched the requested identifier.
+	if (chosen == nullptr)
+	{
+		return CLCG_UNKNOWN_SOLVER;
+	}
+
+	return chosen(Afp, Mfp, Pfp, m, B, param, instance);
+}
+
+
+/**
+ * @brief      Bi-conjugate gradient solver for complex linear systems stored in Eigen vectors.
+ *
+ * The monitored residual is ||r||/n when para.abs_diff is set, and
+ * ||r||^2 / max(||r0||^2, 1) otherwise, where r = B - A*m and n = B.size().
+ *
+ * @param[in]     Afp       Callback computing the matrix-vector product. It is called with
+ *                          (MatNormal, NonConjugate) and with (MatTranspose, Conjugate).
+ * @param[in]     Pfp       Optional progress callback (may be nullptr); a non-zero return
+ *                          stops the iteration.
+ * @param[in,out] m         Initial solution on entry, updated in place.
+ * @param[in]     B         Right-hand side of the linear system.
+ * @param[in]     param     Solver parameters; defparam2 is used when nullptr.
+ * @param[in]     instance  User data passed through to the callbacks.
+ *
+ * @return     One of the status codes: CLCG_INVILAD_VARIABLE_SIZE, CLCG_SIZE_NOT_MATCH,
+ *             CLCG_INVILAD_MAX_ITERATIONS, CLCG_INVILAD_EPSILON, LCG_ALREADY_OPTIMIZIED,
+ *             CLCG_STOP, CLCG_CONVERGENCE or LCG_REACHED_MAX_ITERATIONS.
+ */
+int clbicg(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
+{
+	// Fall back to the library defaults when no parameter set is supplied.
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	int n_size = B.size();
+	// Validate the inputs.
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	std::complex<lcg_float> ak, Ad1d2, r1r2_next, betak;
+	Eigen::VectorXcd r1k(n_size), r2k(n_size), d1k(n_size), d2k(n_size);
+	Eigen::VectorXcd Ax(n_size);
+
+	// Ax = A * m
+	Afp(instance, m, Ax, MatNormal, NonConjugate);
+
+	// Residual/direction pair and its conjugated shadow pair.
+	d1k = r1k = B - Ax;
+	d2k = r2k = r1k.conjugate();
+
+	// Eigen's dot() conjugates its left operand, i.e. this is r2k^H * r1k.
+	std::complex<lcg_float> r1r2 = r2k.dot(r1k);
+
+	// rk_mod holds ||r||^2. (std::norm() of the dot product would square it a second time.)
+	lcg_float rk_mod = r1k.squaredNorm();
+	lcg_float r0_mod = rk_mod;
+	if (r0_mod < 1.0) r0_mod = 1.0;
+
+	// NOTE(review): LCG_ALREADY_OPTIMIZIED / LCG_REACHED_MAX_ITERATIONS come from the real-valued
+	// solver family; CLCG_ equivalents are presumably intended - confirm against the enum.
+	lcg_float residual = para.abs_diff ? std::sqrt(rk_mod)/n_size : rk_mod/r0_mod;
+	if (residual <= para.epsilon)
+	{
+		// Only the criterion selected by para.abs_diff decides whether the start point is accepted.
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, &m, residual, &para, 0);
+		}
+		return LCG_ALREADY_OPTIMIZIED;
+	}
+
+	int t = 0;
+	while (true)
+	{
+		if (Pfp != nullptr && Pfp(instance, &m, residual, &para, t))
+		{
+			return CLCG_STOP;
+		}
+
+		if (residual <= para.epsilon)
+		{
+			return CLCG_CONVERGENCE;
+		}
+
+		// max_iterations == 0 means "no limit".
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			return LCG_REACHED_MAX_ITERATIONS;
+		}
+
+		t++;
+
+		// Ax = A * d1k
+		Afp(instance, d1k, Ax, MatNormal, NonConjugate);
+		Ad1d2 = d2k.dot(Ax);
+		ak = r1r2/Ad1d2;
+
+		m = m + ak*d1k;
+		r1k = r1k - ak*Ax;
+
+		rk_mod = r1k.squaredNorm();
+
+		// Ax = A^H * d2k
+		Afp(instance, d2k, Ax, MatTranspose, Conjugate);
+
+		r2k = r2k - std::conj(ak)*Ax;
+
+		r1r2_next = r2k.dot(r1k);
+		betak = r1r2_next/r1r2;
+		r1r2 = r1r2_next;
+
+		d1k = r1k + betak*d1k;
+		d2k = r2k + std::conj(betak)*d2k;
+
+		residual = para.abs_diff ? std::sqrt(rk_mod)/n_size : rk_mod/r0_mod;
+	}
+}
+
+/**
+ * @brief      Bi-conjugate gradient solver variant that uses a single residual/direction pair
+ *             and unconjugated dot products, for systems stored in Eigen vectors.
+ *
+ * Afp is only ever called with (MatNormal, NonConjugate).
+ *
+ * The monitored residual is ||r||/n when para.abs_diff is set, and
+ * ||r||^2 / max(||r0||^2, 1) otherwise, where r = B - A*m and n = B.size().
+ *
+ * @param[in]     Afp       Callback computing the matrix-vector product.
+ * @param[in]     Pfp       Optional progress callback (may be nullptr); a non-zero return
+ *                          stops the iteration.
+ * @param[in,out] m         Initial solution on entry, updated in place.
+ * @param[in]     B         Right-hand side of the linear system.
+ * @param[in]     param     Solver parameters; defparam2 is used when nullptr.
+ * @param[in]     instance  User data passed through to the callbacks.
+ *
+ * @return     One of the status codes: CLCG_INVILAD_VARIABLE_SIZE, CLCG_SIZE_NOT_MATCH,
+ *             CLCG_INVILAD_MAX_ITERATIONS, CLCG_INVILAD_EPSILON, LCG_ALREADY_OPTIMIZIED,
+ *             CLCG_STOP, CLCG_CONVERGENCE or LCG_REACHED_MAX_ITERATIONS.
+ */
+int clbicg_symmetric(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
+{
+	// Fall back to the library defaults when no parameter set is supplied.
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	int n_size = B.size();
+	// Validate the inputs.
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	std::complex<lcg_float> ak, rkrk2, betak, dkAx;
+	Eigen::VectorXcd rk(n_size), dk(n_size), Ax(n_size);
+
+	// Ax = A * m
+	Afp(instance, m, Ax, MatNormal, NonConjugate);
+
+	dk = rk = (B - Ax);
+
+	// dot() conjugates its left operand, so conjugating it first yields the
+	// unconjugated product rk^T * rk.
+	std::complex<lcg_float> rkrk = rk.conjugate().dot(rk);
+
+	// rk_mod holds ||r||^2. (std::norm() of the dot product would square it a second time.)
+	lcg_float rk_mod = rk.squaredNorm();
+	lcg_float r0_mod = rk_mod;
+	if (r0_mod < 1.0) r0_mod = 1.0;
+
+	// NOTE(review): LCG_ALREADY_OPTIMIZIED / LCG_REACHED_MAX_ITERATIONS come from the real-valued
+	// solver family; CLCG_ equivalents are presumably intended - confirm against the enum.
+	lcg_float residual = para.abs_diff ? std::sqrt(rk_mod)/n_size : rk_mod/r0_mod;
+	if (residual <= para.epsilon)
+	{
+		// Only the criterion selected by para.abs_diff decides whether the start point is accepted.
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, &m, residual, &para, 0);
+		}
+		return LCG_ALREADY_OPTIMIZIED;
+	}
+
+	int t = 0;
+	while (true)
+	{
+		if (Pfp != nullptr && Pfp(instance, &m, residual, &para, t))
+		{
+			return CLCG_STOP;
+		}
+
+		if (residual <= para.epsilon)
+		{
+			return CLCG_CONVERGENCE;
+		}
+
+		// max_iterations == 0 means "no limit".
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			return LCG_REACHED_MAX_ITERATIONS;
+		}
+
+		t++;
+
+		// Ax = A * dk
+		Afp(instance, dk, Ax, MatNormal, NonConjugate);
+		dkAx = dk.conjugate().dot(Ax);
+		ak = rkrk/dkAx;
+
+		m += ak*dk;
+		rk -= ak*Ax;
+
+		rk_mod = rk.squaredNorm();
+
+		rkrk2 = rk.conjugate().dot(rk);
+		betak = rkrk2/rkrk;
+		rkrk = rkrk2;
+
+		dk = rk + betak*dk;
+
+		residual = para.abs_diff ? std::sqrt(rk_mod)/n_size : rk_mod/r0_mod;
+	}
+}
+
+/**
+ * @brief      Conjugate gradient squared solver for complex linear systems (Eigen interface).
+ *
+ * @note       The convergence test on the initial residual is done before the random shadow
+ * vector s0 is drawn. With a zero initial residual the product of s0 and rk is always zero,
+ * so drawing s0 first would never leave the rejection loop.
+ *
+ * @param[in]  Afp       Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp       Callback function for monitoring the iteration progress (may be nullptr).
+ *                       A non-zero return value inside the loop stops the iteration.
+ * @param      m         Initial solution vector, updated in place.
+ * @param[in]  B         Objective vector of the linear system.
+ * @param[in]  param     Parameter setup; defparam2 is used if this pointer is null.
+ * @param      instance  The user data forwarded to the callbacks.
+ *
+ * @return     One of the CLCG_INVILAD_* / CLCG_SIZE_NOT_MATCH codes for bad input, otherwise
+ * LCG_ALREADY_OPTIMIZIED, CLCG_CONVERGENCE, CLCG_STOP or LCG_REACHED_MAX_ITERATIONS.
+ */
+int clcgs(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
+{
+	// take the caller's parameters, or the library defaults if none are given
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	int n_size = B.size();
+	// check parameters
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	std::complex<lcg_float> ak, rhok, rhok2, sigma, betak;
+	Eigen::VectorXcd rk(n_size), s0, pk(n_size);
+	Eigen::VectorXcd Ax(n_size), uk(n_size), qk(n_size), wk(n_size);
+
+	Afp(instance, m, Ax, MatNormal, NonConjugate);
+
+	pk = uk = rk = (B - Ax);
+
+	// std::norm() is the squared magnitude; Eigen's dot() conjugates its left operand
+	lcg_float rk_mod = std::norm(rk.dot(rk));
+	lcg_float r0_mod = rk_mod;
+	if (r0_mod < 1.0) r0_mod = 1.0; // never normalise by less than one
+
+	int ret, t = 0;
+	lcg_float residual;
+
+	// start-up test with the criterion selected by para.abs_diff (same as inside the loop)
+	if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
+	else residual = rk_mod/r0_mod;
+
+	if (residual <= para.epsilon)
+	{
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, &m, residual, &para, 0);
+		}
+		goto func_ends;
+	}
+
+	// draw a random shadow vector that is not (numerically) orthogonal to the residual
+	do
+	{
+		s0 = Eigen::VectorXcd::Random(n_size);
+		rhok = s0.conjugate().dot(rk);
+	} while (std::sqrt(std::norm(rhok)) < 1e-8);
+
+	while(1)
+	{
+		if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
+		else residual = rk_mod/r0_mod;
+
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, &m, residual, &para, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 means no iteration limit
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		Afp(instance, pk, Ax, MatNormal, NonConjugate);
+		sigma = s0.conjugate().dot(Ax);
+		ak = rhok/sigma;
+
+		qk = uk - ak*Ax;
+		wk = uk + qk;
+
+		Afp(instance, wk, Ax, MatNormal, NonConjugate);
+
+		m += ak*wk;
+		rk -= ak*Ax;
+
+		rk_mod = std::norm(rk.dot(rk));
+
+		rhok2 = s0.conjugate().dot(rk);
+		betak = rhok2/rhok;
+		rhok = rhok2;
+
+		uk = rk + betak*qk;
+		pk = uk + betak*(qk + betak*pk);
+	}
+
+	func_ends:
+	{
+		rk.resize(0);
+		s0.resize(0);
+		pk.resize(0);
+		Ax.resize(0);
+		uk.resize(0);
+		qk.resize(0);
+		wk.resize(0);
+	}
+
+	return ret;
+}
+
+/**
+ * @brief      Transpose-free quasi-minimal residual type solver (per the function name) for
+ * complex linear systems, Eigen interface.
+ *
+ * @note       Each pass of the outer loop performs two solution updates (j = 1, 2). Reaching
+ * the iteration limit inside that inner loop leaves the whole solver at once; a plain
+ * 'break' would only leave the inner loop and let the outer loop spin on.
+ *
+ * @param[in]  Afp       Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp       Callback function for monitoring the iteration progress (may be nullptr).
+ *                       A non-zero return value inside the loop stops the iteration.
+ * @param      m         Initial solution vector, updated in place.
+ * @param[in]  B         Objective vector of the linear system.
+ * @param[in]  param     Parameter setup; defparam2 is used if this pointer is null.
+ * @param      instance  The user data forwarded to the callbacks.
+ *
+ * @return     One of the CLCG_INVILAD_* / CLCG_SIZE_NOT_MATCH codes for bad input, otherwise
+ * LCG_ALREADY_OPTIMIZIED, CLCG_CONVERGENCE, CLCG_STOP or LCG_REACHED_MAX_ITERATIONS.
+ */
+int cltfqmr(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
+{
+	// take the caller's parameters, or the library defaults if none are given
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	int n_size = B.size();
+	// check parameters
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	Eigen::VectorXcd pk(n_size), uk(n_size), vk(n_size), dk(n_size);
+	Eigen::VectorXcd r0(n_size), rk(n_size), Ax(n_size), qk(n_size);
+	Eigen::VectorXcd uqk(n_size);
+
+	Afp(instance, m, Ax, MatNormal, NonConjugate);
+
+	pk = uk = r0 = rk = (B - Ax);
+	dk.setZero();
+
+	// Eigen's dot() conjugates its left operand; std::norm() is the squared magnitude
+	std::complex<lcg_float> rk_mod = rk.dot(rk);
+	lcg_float r0_mod = std::norm(rk_mod);
+	if (r0_mod < 1.0) r0_mod = 1.0; // never normalise by less than one
+
+	lcg_float theta = 0.0, omega = sqrt(rk_mod.real());
+	lcg_float residual, tao = omega;
+	std::complex<lcg_float> rk_mod2, sigma, alpha, betak, rho, rho2, sign, eta(0.0, 0.0);
+
+	rho = r0.dot(r0);
+
+	int ret, t = 0;
+
+	// start-up test with the criterion selected by para.abs_diff (same as inside the loop)
+	if (para.abs_diff) residual = std::sqrt(std::norm(rk_mod))/n_size;
+	else residual = std::norm(rk_mod)/r0_mod;
+
+	if (residual <= para.epsilon)
+	{
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, &m, residual, &para, 0);
+		}
+		goto func_ends;
+	}
+
+	while(1)
+	{
+		Afp(instance, pk, vk, MatNormal, NonConjugate);
+
+		sigma = r0.dot(vk);
+		alpha = rho/sigma;
+
+		qk = uk - alpha*vk;
+		uqk = uk + qk;
+
+		Afp(instance, uqk, Ax, MatNormal, NonConjugate);
+
+		rk -= alpha*Ax;
+		rk_mod2 = rk.dot(rk);
+
+		for (int j = 1; j <= 2; j++)
+		{
+			if (para.abs_diff) residual = std::sqrt(std::norm(rk_mod))/n_size;
+			else residual = std::norm(rk_mod)/r0_mod;
+
+			if (Pfp != nullptr)
+			{
+				if (Pfp(instance, &m, residual, &para, t))
+				{
+					ret = CLCG_STOP; goto func_ends;
+				}
+			}
+
+			if (residual <= para.epsilon)
+			{
+				ret = CLCG_CONVERGENCE; goto func_ends;
+			}
+
+			// max_iterations == 0 means no iteration limit
+			if (para.max_iterations > 0 && t+1 > para.max_iterations)
+			{
+				// leave both loops; 'break' would only exit this inner loop
+				ret = LCG_REACHED_MAX_ITERATIONS;
+				goto func_ends;
+			}
+			
+			t++;
+
+			sign = theta*theta*(eta/alpha);
+
+			if (j == 1)
+			{
+				omega = sqrt(sqrt(rk_mod.real())*sqrt(rk_mod2.real()));
+				dk = uk + sign*dk;
+			}
+			else
+			{
+				omega = sqrt(rk_mod2.real());
+				dk = qk + sign*dk;
+			}
+
+			theta = omega/tao;
+			tao = omega/sqrt(1.0+theta*theta);
+			eta = (1.0/(1.0+theta*theta))*alpha;
+
+			m += eta*dk;
+		}
+		rk_mod = rk_mod2;
+
+		rho2 = r0.dot(rk);
+		betak = rho2/rho;
+		rho = rho2;
+
+		uk = rk + betak*qk;
+		pk = uk + betak*(qk + betak*pk);
+	}
+
+	func_ends:
+	{
+		pk.resize(0);
+		uk.resize(0);
+		vk.resize(0);
+		dk.resize(0);
+		r0.resize(0);
+		rk.resize(0);
+		Ax.resize(0);
+		qk.resize(0);
+		uqk.resize(0);
+	}
+
+	return ret;
+}
+
+/**
+ * @brief      Preconditioned conjugate gradient type solver for complex linear systems
+ * (Eigen interface).
+ *
+ * @note       Eigen's dot() conjugates its left operand, so x.conjugate().dot(y) below is
+ * the unconjugated product sum(x_i*y_i), and std::norm() returns a squared magnitude.
+ *
+ * @param[in]  Afp       Callback function for calculating the product of 'Ax'.
+ * @param[in]  Mfp       Callback function applying the preconditioner to a vector.
+ * @param[in]  Pfp       Callback function for monitoring the iteration progress (may be nullptr).
+ *                       A non-zero return value inside the loop stops the iteration.
+ * @param      m         Initial solution vector, updated in place.
+ * @param[in]  B         Objective vector of the linear system.
+ * @param[in]  param     Parameter setup; defparam2 is used if this pointer is null.
+ * @param      instance  The user data forwarded to the callbacks.
+ *
+ * @return     One of the CLCG_INVILAD_* / CLCG_SIZE_NOT_MATCH codes for bad input, otherwise
+ * LCG_ALREADY_OPTIMIZIED, CLCG_CONVERGENCE, CLCG_STOP or LCG_REACHED_MAX_ITERATIONS.
+ */
+int clpcg(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
+	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
+{
+	// take the caller's parameters, or the library defaults if none are given
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	int n_size = B.size();
+	// check parameters
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	Eigen::VectorXcd rk(n_size), dk(n_size), sk(n_size), Ax(n_size);
+
+	Afp(instance, m, Ax, MatNormal, NonConjugate);
+
+	// initial residual and its preconditioned image, the first search direction
+	rk = (B - Ax);
+	Mfp(instance, rk, dk, MatNormal, NonConjugate);
+
+	std::complex<lcg_float> ak, d_old, betak, dkAx;
+	std::complex<lcg_float> d_new = rk.conjugate().dot(dk);
+
+	lcg_float rk_mod = std::norm(rk.dot(rk));
+	lcg_float r0_mod = rk_mod;
+	if (r0_mod < 1.0) r0_mod = 1.0; // never normalise by less than one
+
+	int ret, t = 0;
+	lcg_float residual;
+
+	// start-up test with the criterion selected by para.abs_diff (same as inside the loop)
+	if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
+	else residual = rk_mod/r0_mod;
+
+	if (residual <= para.epsilon)
+	{
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, &m, residual, &para, 0);
+		}
+		goto func_ends;
+	}
+
+	while(1)
+	{
+		if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
+		else residual = rk_mod/r0_mod;
+
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, &m, residual, &para, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 means no iteration limit
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		// step length along dk
+		Afp(instance, dk, Ax, MatNormal, NonConjugate);
+		dkAx = dk.conjugate().dot(Ax);
+		ak = d_new/dkAx;
+
+		m += ak*dk;
+		rk -= ak*Ax;
+
+		rk_mod = std::norm(rk.dot(rk));
+
+		// precondition the new residual and build the next direction
+		Mfp(instance, rk, sk, MatNormal, NonConjugate);
+
+		d_old = d_new;
+		d_new = rk.conjugate().dot(sk);
+
+		betak = d_new/d_old;
+
+		dk = sk + betak*dk;
+	}
+
+	func_ends:
+	{
+		rk.resize(0);
+		dk.resize(0);
+		sk.resize(0);
+		Ax.resize(0);
+	}
+
+	return ret;
+}
+
+/**
+ * @brief      Preconditioned bi-conjugate gradient type solver for complex linear systems
+ * (Eigen interface).
+ *
+ * @note       The shadow sequences rsk/psk are kept as conjugates of the primary ones and
+ * Afp is asked for the 'Conjugate' product for them. Eigen's dot() conjugates its left
+ * operand and std::norm() returns a squared magnitude.
+ *
+ * @param[in]  Afp       Callback function for calculating the product of 'Ax'.
+ * @param[in]  Mfp       Callback function applying the preconditioner to a vector.
+ * @param[in]  Pfp       Callback function for monitoring the iteration progress (may be nullptr).
+ *                       A non-zero return value inside the loop stops the iteration.
+ * @param      m         Initial solution vector, updated in place.
+ * @param[in]  B         Objective vector of the linear system.
+ * @param[in]  param     Parameter setup; defparam2 is used if this pointer is null.
+ * @param      instance  The user data forwarded to the callbacks.
+ *
+ * @return     One of the CLCG_INVILAD_* / CLCG_SIZE_NOT_MATCH codes for bad input, otherwise
+ * LCG_ALREADY_OPTIMIZIED, CLCG_CONVERGENCE, CLCG_STOP or LCG_REACHED_MAX_ITERATIONS.
+ */
+int clpbicg(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
+	Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance)
+{
+	// take the caller's parameters, or the library defaults if none are given
+	clcg_para para = (param != nullptr) ? (*param) : defparam2;
+
+	int n_size = B.size();
+	// check parameters
+	if (n_size <= 0) return CLCG_INVILAD_VARIABLE_SIZE;
+	if (n_size != m.size()) return CLCG_SIZE_NOT_MATCH;
+	if (para.max_iterations < 0) return CLCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return CLCG_INVILAD_EPSILON;
+
+	std::complex<lcg_float> ak, betak, pkAx, rhok2;
+	Eigen::VectorXcd rk(n_size), rsk(n_size), zk(n_size), pk(n_size), psk(n_size), Ax(n_size), Asx(n_size);
+
+	Afp(instance, m, Ax, MatNormal, NonConjugate);
+
+	rk = (B - Ax);
+	Mfp(instance, rk, zk, MatNormal, NonConjugate);
+
+	pk = zk;
+	rsk = rk.conjugate();
+	psk = pk.conjugate();
+
+	std::complex<lcg_float> rhok = rsk.dot(zk);
+
+	lcg_float rk_mod = std::norm(rk.dot(rk));
+	lcg_float r0_mod = rk_mod;
+	if (r0_mod < 1.0) r0_mod = 1.0; // never normalise by less than one
+
+	int ret, t = 0;
+	lcg_float residual;
+
+	// start-up test with the criterion selected by para.abs_diff (same as inside the loop)
+	if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
+	else residual = rk_mod/r0_mod;
+
+	if (residual <= para.epsilon)
+	{
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, &m, residual, &para, 0);
+		}
+		goto func_ends;
+	}
+
+	while(1)
+	{
+		if (para.abs_diff) residual = std::sqrt(rk_mod)/n_size;
+		else residual = rk_mod/r0_mod;
+
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, &m, residual, &para, t))
+			{
+				ret = CLCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = CLCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 means no iteration limit
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		Afp(instance, pk, Ax, MatNormal, NonConjugate);
+		Afp(instance, psk, Asx, MatNormal, Conjugate);
+
+		pkAx = psk.dot(Ax);
+		ak = rhok/pkAx;
+
+		m += ak*pk;
+		// rsk must be formed from the old rk, so it is updated before rk
+		rsk = rk.conjugate() - std::conj(ak)*Asx;
+		rk -= ak*Ax;
+
+		rk_mod = std::norm(rk.dot(rk));
+
+		Mfp(instance, rk, zk, MatNormal, NonConjugate);
+
+		rhok2 = rsk.dot(zk);
+		betak = rhok2/rhok;
+		rhok = rhok2;
+
+		pk = zk + betak*pk;
+		psk = zk.conjugate() + std::conj(betak)*psk;
+	}
+
+	func_ends:
+	{
+		rk.resize(0);
+		rsk.resize(0);
+		zk.resize(0);
+		pk.resize(0);
+		psk.resize(0);
+		Ax.resize(0);
+		Asx.resize(0);
+	}
+
+	return ret;
+}
--- a/src/lib/clcg_eigen.h
+++ b/src/lib/clcg_eigen.h
@@ -0,0 +1,94 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _CLCG_EIGEN_H
+#define _CLCG_EIGEN_H
+
+#include "util.h"
+#include "complex"
+#include "Eigen/Dense"
+
+/**
+ * @brief  Callback interface for calculating the product of a N*N matrix 'A' multiplied 
+ * by a vertical vector 'x'.
+ * 
+ * @param  instance    The user data sent for the solver functions by the client.
+ * @param  x           Multiplier of the Ax product.
+ * @param  prod_Ax     Product of A multiplied by x (output).
+ * @param  layout      Layout information of the matrix A passed by the solver functions 
+ *                     (e.g. MatNormal or MatTranspose).
+ * @param  conjugate   Conjugation information of the matrix A passed by the solver functions 
+ *                     (e.g. NonConjugate or Conjugate).
+ */
+typedef void (*clcg_axfunc_eigen_ptr)(void* instance, const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Ax, 
+	lcg_matrix_e layout, clcg_complex_e conjugate);
+
+/**
+ * @brief     Callback interface for monitoring the progress and terminating the iteration 
+ * if necessary.
+ * 
+ * @param    instance    The user data sent for the solver functions by the client.
+ * @param    m           Pointer to the current solution vector.
+ * @param    converge    The current value evaluating the iteration progress.
+ * @param    param       The parameter object passed by the solver functions.
+ * @param    k           The iteration count.
+ * 
+ * @retval   int         Zero to continue the optimization process. Returning a
+ *                       non-zero value will terminate the optimization process.
+ */
+typedef int (*clcg_progress_eigen_ptr)(void* instance, const Eigen::VectorXcd *m, const lcg_float converge, 
+	const clcg_para *param, const int k);
+
+/**
+ * @brief      A combined conjugate gradient solver function for complex linear systems 
+ * stored in Eigen vectors.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the solver function by the client. 
+ * This variable is either 'this' for class member functions or 'nullptr' for global functions.
+ * @param[in]  solver_id   Solver type used to solve the linear system. The default value is CLCG_CGS.
+ *
+ * @return     Status of the function.
+ */
+int clcg_solver_eigen(clcg_axfunc_eigen_ptr Afp, clcg_progress_eigen_ptr Pfp, Eigen::VectorXcd &m, 
+	const Eigen::VectorXcd &B, const clcg_para* param, void* instance, clcg_solver_enum solver_id = CLCG_CGS);
+
+/**
+ * @brief      A combined preconditioned conjugate gradient solver function for complex 
+ * linear systems stored in Eigen vectors.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Mfp         Callback function for calculating the product of 'M^{-1}x', in which M is the preconditioning matrix.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the solver function by the client. 
+ * This variable is either 'this' for class member functions or 'nullptr' for global functions.
+ * @param[in]  solver_id   Solver type used to solve the linear system. The value must be CLCG_PBICG (default) or CLCG_PCG.
+ *
+ * @return     Status of the function.
+ */
+int clcg_solver_preconditioned_eigen(clcg_axfunc_eigen_ptr Afp, clcg_axfunc_eigen_ptr Mfp, clcg_progress_eigen_ptr Pfp, 
+    Eigen::VectorXcd &m, const Eigen::VectorXcd &B, const clcg_para* param, void* instance, clcg_solver_enum solver_id = CLCG_PBICG);
+
+#endif // _CLCG_EIGEN_H
--- a/src/lib/lcg.cpp
+++ b/src/lib/lcg.cpp
--- a/src/lib/lcg.h
+++ b/src/lib/lcg.h
@@ -0,0 +1,171 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _LCG_H
+#define _LCG_H
+
+#include "util.h"
+
+/**
+ * @brief  Callback interface for calculating the product of a N*N matrix 'A' multiplied 
+ * by a vertical vector 'x'.
+ * 
+ * @param  instance    The user data sent for the lcg_solver() functions by the client.
+ * @param  x           Multiplier of the Ax product.
+ * @param  prod_Ax     Product of A multiplied by x (output).
+ * @param  n_size      Size of x and column/row numbers of A.
+ */
+typedef void (*lcg_axfunc_ptr)(void* instance, const lcg_float* x, lcg_float* prod_Ax, 
+	const int n_size);
+
+/**
+ * @brief     Callback interface for monitoring the progress and terminating the iteration 
+ * if necessary.
+ * 
+ * @param    instance    The user data sent for the lcg_solver() functions by the client.
+ * @param    m           The current solutions.
+ * @param    converge    The current value evaluating the iteration progress.
+ * @param    param       The parameter object passed by the solver functions.
+ * @param    n_size      The size of the variables.
+ * @param    k           The iteration count.
+ * 
+ * @retval   int         Zero to continue the optimization process. Returning a
+ *                       non-zero value will terminate the optimization process.
+ */
+typedef int (*lcg_progress_ptr)(void* instance, const lcg_float* m, const lcg_float converge, 
+	const lcg_para* param, const int n_size, const int k);
+
+/**
+ * @brief      A combined conjugate gradient solver function.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector of the size n_size.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcg_solver() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param[in]  solver_id   Solver type used to solve the linear system. The default value is LCG_CGS.
+ *
+ * @return     Status of the function.
+ */
+int lcg_solver(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
+	const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_CGS);
+
+/**
+ * @brief      A combined preconditioned conjugate gradient solver function.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Mfp         Callback function for calculating the product of 'M^{-1}x', in which M is the preconditioning matrix.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector of the size n_size.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcg_solver_preconditioned() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param[in]  solver_id   Solver type used to solve the linear system. The default value is LCG_PCG.
+ *
+ * @return     Status of the function.
+ */
+int lcg_solver_preconditioned(lcg_axfunc_ptr Afp, lcg_axfunc_ptr Mfp, lcg_progress_ptr Pfp, lcg_float* m, 
+	const lcg_float* B, const int n_size, const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_PCG);
+
+/**
+ * @brief      A combined conjugate gradient solver function with inequality constraints.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector of the size n_size.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  low         The lower boundary of the acceptable solution.
+ * @param[in]  hig         The higher boundary of the acceptable solution.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcg_solver_constrained() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param[in]  solver_id   Solver type used to solve the linear system. The default value is LCG_PG.
+ *
+ * @return     Status of the function.
+ */
+int lcg_solver_constrained(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, 
+	const lcg_float* low, const lcg_float *hig, const int n_size, const lcg_para* param, 
+	void* instance, lcg_solver_enum solver_id = LCG_PG);
+
+/**
+ * @brief      Standalone function of the Linear Conjugate Gradient algorithm.
+ * 
+ * @note       To use the lcg() function for massive inversions, it is better to provide 
+ * external vectors Gk, Dk and ADk to avoid allocating and destroying temporary vectors.
+ *
+ * @param[in]  Afp       Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp       Callback function for monitoring the iteration progress.
+ * @param      m         Initial solution vector of the size n_size.
+ * @param[in]  B         Objective vector of the linear system.
+ * @param[in]  n_size    Size of the solution vector and objective vector.
+ * @param[in]  param     Parameter setup for the conjugate gradient methods.
+ * @param      instance  The user data sent for the lcg() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      Gk        Conjugate gradient vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ * @param      Dk        Directional gradient vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ * @param      ADk       Intermediate vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ *
+ * @return     Status of the function.
+ */
+int lcg(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
+    const lcg_para* param, void* instance, lcg_float* Gk = nullptr, lcg_float* Dk = nullptr, 
+    lcg_float* ADk = nullptr);
+
+
+/**
+ * @brief      Standalone function of the Conjugate Gradient Squared algorithm.
+ * 
+ * @note       Algorithm 2 in "Generalized conjugate gradient method" by Fokkema et al. (1996).
+ * 
+ * @note       To use the lcgs() function for massive inversions, it is better to provide 
+ * external vectors RK, R0T, PK, AX, UK, QK, and WK to avoid allocating and destroying temporary vectors.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector of the size n_size.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcgs() function by the client. 
+ * This variable is either 'this' for class member functions or 'nullptr' for global functions.
+ * @param      RK          Intermediate vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ * @param      R0T         Intermediate vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ * @param      PK          Intermediate vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ * @param      AX          Intermediate vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ * @param      UK          Intermediate vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ * @param      QK          Intermediate vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ * @param      WK          Intermediate vector of the size n_size. If this pointer is null (the default), the function will create an internal vector instead.
+ *
+ * @return     Status of the function.
+ */
+int lcgs(lcg_axfunc_ptr Afp, lcg_progress_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
+    const lcg_para* param, void* instance, lcg_float* RK = nullptr, lcg_float* R0T = nullptr, 
+    lcg_float* PK = nullptr, lcg_float* AX = nullptr, lcg_float* UK = nullptr, lcg_float* QK = nullptr, 
+    lcg_float* WK = nullptr);
+
+#endif // _LCG_H
--- a/src/lib/lcg_complex.cpp
+++ b/src/lib/lcg_complex.cpp
@@ -0,0 +1,496 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "cmath"
+#include "ctime"
+#include "random"
+
+#include "lcg_complex.h"
+
+#ifdef LibLCG_OPENMP
+#include "omp.h"
+#endif
+
+// Allocate an array of n complex numbers with new[]; release it with clcg_free().
+lcg_complex* clcg_malloc(int n)
+{
+	return new lcg_complex [n];
+}
+
+// Allocate m separately new[]-ed rows of n complex numbers each, plus the row table.
+lcg_complex** clcg_malloc(int m, int n)
+{
+	lcg_complex **rows = new lcg_complex* [m];
+	int r = 0;
+	while (r < m)
+	{
+		rows[r] = new lcg_complex [n];
+		++r;
+	}
+	return rows;
+}
+
+// Release an array obtained from clcg_malloc(int). The pointer is passed by value,
+// so the caller's copy is left unchanged.
+void clcg_free(lcg_complex* x)
+{
+	if (x == nullptr) return;
+
+	delete[] x;
+	x = nullptr;
+}
+
+// Release the m rows and the row table obtained from clcg_malloc(int, int). The pointer
+// is passed by value, so the caller's copy is left unchanged.
+void clcg_free(lcg_complex **x, int m)
+{
+	if (x == nullptr) return;
+
+	int r = 0;
+	while (r < m)
+	{
+		delete[] x[r];
+		++r;
+	}
+	delete[] x;
+	x = nullptr;
+}
+
+// Assign the value b to the first 'size' elements of a.
+void clcg_vecset(lcg_complex *a, lcg_complex b, int size)
+{
+	int k = 0;
+	while (k < size)
+	{
+		a[k] = b;
+		++k;
+	}
+}
+
+// Assign the value b to every element of the m-by-n array a, row by row.
+void clcg_vecset(lcg_complex **a, lcg_complex b, int m, int n)
+{
+	for (int r = 0; r < m; r++)
+	{
+		lcg_complex *row = a[r];
+		for (int c = 0; c < n; c++)
+		{
+			row[c] = b;
+		}
+	}
+}
+
+#ifdef LibLCG_STD_COMPLEX
+
+// Store r as the real part and i as the imaginary part of *a.
+void clcg_set(lcg_complex *a, lcg_float r, lcg_float i)
+{
+	lcg_complex &z = *a;
+	z.real(r);
+	z.imag(i);
+}
+
+// Squared magnitude of *a (std::norm).
+lcg_float clcg_square(const lcg_complex *a)
+{
+	const lcg_complex &z = *a;
+	return std::norm(z);
+}
+
+// Magnitude of *a, computed as the square root of its squared magnitude.
+lcg_float clcg_module(const lcg_complex *a)
+{
+	const lcg_float squared = std::norm(*a);
+	return sqrt(squared);
+}
+
+// Complex conjugate of *a, returned by value.
+lcg_complex clcg_conjugate(const lcg_complex *a)
+{
+	return std::conj(*a);
+}
+
+/**
+ * @brief  Fill a with random complex numbers whose real and imaginary parts are drawn
+ * uniformly between the corresponding parts of l and h.
+ *
+ * The generator is seeded once per thread. Re-seeding with time(0) on every call made
+ * calls within the same second return identical vectors and overwrote any rand() seed
+ * set by the application.
+ *
+ * @param      a     Array of at least 'size' elements to be filled.
+ * @param[in]  l     Lower bounds of the real and imaginary parts.
+ * @param[in]  h     Higher bounds of the real and imaginary parts.
+ * @param[in]  size  Number of elements to fill.
+ */
+void clcg_vecrnd(lcg_complex *a, lcg_complex l, lcg_complex h, int size)
+{
+	static thread_local std::mt19937 engine(std::random_device{}());
+	std::uniform_real_distribution<lcg_float> unit(0.0, 1.0);
+
+	for (int i = 0; i < size; i++)
+	{
+		// real part is drawn before the imaginary part
+		a[i].real((h.real()-l.real())*unit(engine) + l.real());
+		a[i].imag((h.imag()-l.imag())*unit(engine) + l.imag());
+	}
+	return;
+}
+
+/**
+ * @brief  Fill the m-by-n array a with random complex numbers whose real and imaginary
+ * parts are drawn uniformly between the corresponding parts of l and h.
+ *
+ * The generator is seeded once per thread. Re-seeding with time(0) on every call made
+ * calls within the same second return identical arrays and overwrote any rand() seed
+ * set by the application.
+ *
+ * @param      a  Array of m rows with at least n elements each.
+ * @param[in]  l  Lower bounds of the real and imaginary parts.
+ * @param[in]  h  Higher bounds of the real and imaginary parts.
+ * @param[in]  m  Number of rows.
+ * @param[in]  n  Number of columns.
+ */
+void clcg_vecrnd(lcg_complex **a, lcg_complex l, lcg_complex h, int m, int n)
+{
+	static thread_local std::mt19937 engine(std::random_device{}());
+	std::uniform_real_distribution<lcg_float> unit(0.0, 1.0);
+
+	for (int i = 0; i < m; i++)
+	{
+		for (int j = 0; j < n; j++)
+		{
+			// real part is drawn before the imaginary part
+			a[i][j].real((h.real()-l.real())*unit(engine) + l.real());
+			a[i][j].imag((h.imag()-l.imag())*unit(engine) + l.imag());
+		}
+	}
+	return;
+}
+
+// Unconjugated product of two complex vectors: ret = \sum{a_k \cdot b_k}.
+void clcg_dot(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size)
+{
+	lcg_float sum_re = 0.0;
+	lcg_float sum_im = 0.0;
+	for (int k = 0; k < size; ++k)
+	{
+		const lcg_float ar = a[k].real(), ai = a[k].imag();
+		const lcg_float br = b[k].real(), bi = b[k].imag();
+		sum_re += (ar*br - ai*bi);
+		sum_im += (ar*bi + ai*br);
+	}
+	ret.real(sum_re);
+	ret.imag(sum_im);
+}
+
+// Inner product with the first operand conjugated: ret = \sum{\bar{a_k} \cdot b_k}.
+void clcg_inner(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size)
+{
+	lcg_float sum_re = 0.0;
+	lcg_float sum_im = 0.0;
+	for (int k = 0; k < size; ++k)
+	{
+		const lcg_float ar = a[k].real(), ai = a[k].imag();
+		const lcg_float br = b[k].real(), bi = b[k].imag();
+		sum_re += (ar*br + ai*bi);
+		sum_im += (ar*bi - ai*br);
+	}
+	ret.real(sum_re);
+	ret.imag(sum_im);
+}
+
+/**
+ * @brief  Product of the m_size-by-n_size matrix A (or a variant of it) and the vector x.
+ *
+ * With layout == MatNormal the rows of A are multiplied by x (x has n_size elements,
+ * Ax has m_size). Otherwise the columns of A are used (x has m_size elements, Ax has
+ * n_size). With conjugate == Conjugate every element of A is conjugated first.
+ *
+ * @param[in]  A          Matrix stored as m_size rows of n_size elements.
+ * @param[in]  x          Vector to be multiplied.
+ * @param      Ax         Output vector.
+ * @param[in]  m_size     Row number of A.
+ * @param[in]  n_size     Column number of A.
+ * @param[in]  layout     MatNormal for A; any other value selects the transposed product.
+ * @param[in]  conjugate  Conjugate for conj(A); any other value uses A as it is.
+ */
+void clcg_matvec(lcg_complex **A, const lcg_complex *x, lcg_complex *Ax, 
+	int m_size, int n_size, lcg_matrix_e layout, clcg_complex_e conjugate)
+{
+	size_t i, j;
+	lcg_float re, im;
+
+	const bool use_conj = (conjugate == Conjugate);
+	const bool by_rows = (layout == MatNormal);
+
+	if (use_conj && by_rows)
+	{
+		// conj(A) * x
+#pragma omp parallel for private (i, j, re, im) schedule(guided)
+		for (i = 0; i < m_size; i++)
+		{
+			re = 0.0; im = 0.0;
+			for (j = 0; j < n_size; j++)
+			{
+				re += (A[i][j].real()*x[j].real() + A[i][j].imag()*x[j].imag());
+				im += (A[i][j].real()*x[j].imag() - A[i][j].imag()*x[j].real());
+			}
+			Ax[i].real(re); Ax[i].imag(im);
+		}
+	}
+	else if (use_conj)
+	{
+		// conj(A)^T * x
+#pragma omp parallel for private (i, j, re, im) schedule(guided)
+		for (j = 0; j < n_size; j++)
+		{
+			re = 0.0; im = 0.0;
+			for (i = 0; i < m_size; i++)
+			{
+				re += (A[i][j].real()*x[i].real() + A[i][j].imag()*x[i].imag());
+				im += (A[i][j].real()*x[i].imag() - A[i][j].imag()*x[i].real());
+			}
+			Ax[j].real(re); Ax[j].imag(im);
+		}
+	}
+	else if (by_rows)
+	{
+		// A * x
+#pragma omp parallel for private (i, j, re, im) schedule(guided)
+		for (i = 0; i < m_size; i++)
+		{
+			re = 0.0; im = 0.0;
+			for (j = 0; j < n_size; j++)
+			{
+				re += (A[i][j].real()*x[j].real() - A[i][j].imag()*x[j].imag());
+				im += (A[i][j].real()*x[j].imag() + A[i][j].imag()*x[j].real());
+			}
+			Ax[i].real(re); Ax[i].imag(im);
+		}
+	}
+	else
+	{
+		// A^T * x
+#pragma omp parallel for private (i, j, re, im) schedule(guided)
+		for (j = 0; j < n_size; j++)
+		{
+			re = 0.0; im = 0.0;
+			for (i = 0; i < m_size; i++)
+			{
+				re += (A[i][j].real()*x[i].real() - A[i][j].imag()*x[i].imag());
+				im += (A[i][j].real()*x[i].imag() + A[i][j].imag()*x[i].real());
+			}
+			Ax[j].real(re); Ax[j].imag(im);
+		}
+	}
+	return;
+}
+
+#else
+
+// Default constructor: both components start at zero.
+lcg_complex::lcg_complex() : rel(0.0), img(0.0)
+{
+}
+
+// Construct from a real part r and an imaginary part i.
+lcg_complex::lcg_complex(lcg_float r, lcg_float i) : rel(r), img(i)
+{
+}
+
+// Nothing to release: the object only holds two plain values.
+lcg_complex::~lcg_complex()
+{
+}
+
+// Setter: overwrite the real part with a.
+void lcg_complex::real(lcg_float a)
+{
+	this->rel = a;
+}
+
+// Setter: overwrite the imaginary part with a.
+void lcg_complex::imag(lcg_float a)
+{
+	this->img = a;
+}
+
+// Getter: return the real part.
+lcg_float lcg_complex::real()
+{
+	return this->rel;
+}
+
+// Getter: return the imaginary part.
+lcg_float lcg_complex::imag()
+{
+	return this->img;
+}
+
+// Two complex numbers are equal when both components compare equal exactly.
+bool operator==(const lcg_complex &a, const lcg_complex &b)
+{
+	return (a.rel == b.rel) && (a.img == b.img);
+}
+
+// Two complex numbers differ when at least one component differs.
+bool operator!=(const lcg_complex &a, const lcg_complex &b)
+{
+	return (a.rel != b.rel) || (a.img != b.img);
+}
+
+// Component-wise sum a + b.
+lcg_complex operator+(const lcg_complex &a, const lcg_complex &b)
+{
+	return lcg_complex(a.rel + b.rel, a.img + b.img);
+}
+
+// Component-wise difference a - b.
+lcg_complex operator-(const lcg_complex &a, const lcg_complex &b)
+{
+	return lcg_complex(a.rel - b.rel, a.img - b.img);
+}
+
+// Complex product: (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i(ar*bi + ai*br).
+lcg_complex operator*(const lcg_complex &a, const lcg_complex &b)
+{
+	lcg_complex prod;
+	prod.rel = a.rel*b.rel - a.img*b.img;
+	prod.img = a.rel*b.img + a.img*b.rel;
+	return prod;
+}
+
+// Scale a complex number b by the real factor a.
+lcg_complex operator*(const lcg_float &a, const lcg_complex &b)
+{
+	return lcg_complex(a*b.rel, a*b.img);
+}
+
+// Complex quotient a / b, computed as a*conj(b) / |b|^2.
+// A divisor that is exactly zero yields NaN in both components.
+lcg_complex operator/(const lcg_complex &a, const lcg_complex &b)
+{
+	lcg_complex quot;
+	if (b.rel == 0 && b.img == 0)
+	{
+		quot.rel = quot.img = NAN;
+		return quot;
+	}
+
+	// Squared modulus of the divisor, shared by both components.
+	const lcg_float den = b.rel*b.rel + b.img*b.img;
+	quot.rel = (a.rel*b.rel + a.img*b.img)/den;
+	quot.img = (a.img*b.rel - a.rel*b.img)/den;
+	return quot;
+}
+
+// Quotient of a real number a by a complex number b: a*conj(b) / |b|^2.
+// A divisor that is exactly zero yields NaN in both components.
+lcg_complex operator/(const lcg_float &a, const lcg_complex &b)
+{
+	lcg_complex quot;
+	if (b.rel == 0 && b.img == 0)
+	{
+		quot.rel = quot.img = NAN;
+		return quot;
+	}
+
+	// Squared modulus of the divisor, shared by both components.
+	const lcg_float den = b.rel*b.rel + b.img*b.img;
+	quot.rel = a*b.rel/den;
+	quot.img = -1.0*a*b.img/den;
+	return quot;
+}
+
+// Print as "<re>+<im>i"; a negative imaginary part supplies its own sign,
+// so the explicit "+" is only written when the imaginary part is >= 0.
+std::ostream &operator<<(std::ostream &os, const lcg_complex &a)
+{
+	os << a.rel;
+	if (a.img >= 0)
+	{
+		os << "+";
+	}
+	os << a.img << "i";
+	return os;
+}
+
+// Overwrite both components of *a with the real part r and imaginary part i.
+void clcg_set(lcg_complex *a, lcg_float r, lcg_float i)
+{
+	lcg_complex &target = *a;
+	target.rel = r;
+	target.img = i;
+}
+
+// Squared modulus |a|^2 = re^2 + im^2.
+lcg_float clcg_square(const lcg_complex *a)
+{
+	const lcg_complex &z = *a;
+	return z.rel * z.rel + z.img * z.img;
+}
+
+// Modulus |a|, i.e. the square root of the squared modulus.
+lcg_float clcg_module(const lcg_complex *a)
+{
+	const lcg_float squared = clcg_square(a);
+	return sqrt(squared);
+}
+
+// Return the complex conjugate of *a: same real part, negated imaginary part.
+lcg_complex clcg_conjugate(const lcg_complex *a)
+{
+	return lcg_complex(a->rel, -1.0 * a->img);
+}
+
+// Fill a complex vector with pseudo-random values. Real parts are drawn
+// between l.rel and h.rel, imaginary parts between l.img and h.img.
+// The generator is re-seeded from the clock on every call.
+void clcg_vecrnd(lcg_complex *a, lcg_complex l, lcg_complex h, int size)
+{
+	srand(time(nullptr));
+	for (int k = 0; k < size; k++)
+	{
+		lcg_complex &elem = a[k];
+		// The real sample is drawn before the imaginary one.
+		elem.rel = (h.rel-l.rel)*rand()*1.0/RAND_MAX + l.rel;
+		elem.img = (h.img-l.img)*rand()*1.0/RAND_MAX + l.img;
+	}
+}
+
+// Fill an m-by-n complex matrix with pseudo-random values. Real parts are
+// drawn between l.rel and h.rel, imaginary parts between l.img and h.img.
+// The generator is re-seeded from the clock on every call.
+void clcg_vecrnd(lcg_complex **a, lcg_complex l, lcg_complex h, int m, int n)
+{
+	srand(time(nullptr));
+	for (int row = 0; row < m; row++)
+	{
+		for (int col = 0; col < n; col++)
+		{
+			lcg_complex &elem = a[row][col];
+			// The real sample is drawn before the imaginary one.
+			elem.rel = (h.rel-l.rel)*rand()*1.0/RAND_MAX + l.rel;
+			elem.img = (h.img-l.img)*rand()*1.0/RAND_MAX + l.img;
+		}
+	}
+}
+
+// Unconjugated product of two complex vectors: <a,b> = \sum{a_i \cdot b_i}.
+// ret is zeroed first and then used directly as the accumulator.
+void clcg_dot(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size)
+{
+	clcg_set(&ret, 0.0, 0.0);
+	for (int k = 0; k < size; k++)
+	{
+		const lcg_complex &u = a[k];
+		const lcg_complex &v = b[k];
+		ret.rel += (u.rel*v.rel - u.img*v.img);
+		ret.img += (u.rel*v.img + u.img*v.rel);
+	}
+}
+
+// Conjugated inner product of two complex vectors:
+// <a,b> = \sum{\bar{a_i} \cdot b_i}.
+// ret is zeroed first and then used directly as the accumulator.
+void clcg_inner(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size)
+{
+	clcg_set(&ret, 0.0, 0.0);
+	for (int k = 0; k < size; k++)
+	{
+		const lcg_complex &u = a[k];
+		const lcg_complex &v = b[k];
+		ret.rel += (u.rel*v.rel + u.img*v.img);
+		ret.img += (u.rel*v.img - u.img*v.rel);
+	}
+}
+
+// Multiply a dense complex matrix (array of row pointers) by a complex vector.
+//
+// layout == MatNormal : Ax[i] = \sum_j op(A[i][j]) * x[j], i in [0, m_size)
+// any other layout    : Ax[j] = \sum_i op(A[i][j]) * x[i], j in [0, n_size)
+// op() is the complex conjugate when conjugate == Conjugate, identity otherwise.
+void clcg_matvec(lcg_complex **A, const lcg_complex *x, lcg_complex *Ax, 
+	int m_size, int n_size, lcg_matrix_e layout, clcg_complex_e conjugate)
+{
+	int r, c;
+	lcg_float acc_re, acc_im;
+	const bool use_conj = (conjugate == Conjugate);
+
+	if (layout == MatNormal)
+	{
+		// One output element per matrix row.
+#pragma omp parallel for private (r, c, acc_re, acc_im) schedule(guided)
+		for (r = 0; r < m_size; r++)
+		{
+			acc_re = 0.0; acc_im = 0.0;
+			for (c = 0; c < n_size; c++)
+			{
+				if (use_conj)
+				{
+					// conj(A_rc) * x_c
+					acc_re += (A[r][c].rel*x[c].rel + A[r][c].img*x[c].img);
+					acc_im += (A[r][c].rel*x[c].img - A[r][c].img*x[c].rel);
+				}
+				else
+				{
+					// A_rc * x_c
+					acc_re += (A[r][c].rel*x[c].rel - A[r][c].img*x[c].img);
+					acc_im += (A[r][c].rel*x[c].img + A[r][c].img*x[c].rel);
+				}
+			}
+			clcg_set(&Ax[r], acc_re, acc_im);
+		}
+		return;
+	}
+
+	// Transposed use of A: one output element per matrix column.
+#pragma omp parallel for private (r, c, acc_re, acc_im) schedule(guided)
+	for (c = 0; c < n_size; c++)
+	{
+		acc_re = 0.0; acc_im = 0.0;
+		for (r = 0; r < m_size; r++)
+		{
+			if (use_conj)
+			{
+				acc_re += (A[r][c].rel*x[r].rel + A[r][c].img*x[r].img);
+				acc_im += (A[r][c].rel*x[r].img - A[r][c].img*x[r].rel);
+			}
+			else
+			{
+				acc_re += (A[r][c].rel*x[r].rel - A[r][c].img*x[r].img);
+				acc_im += (A[r][c].rel*x[r].img + A[r][c].img*x[r].rel);
+			}
+		}
+		clcg_set(&Ax[c], acc_re, acc_im);
+	}
+}
+
+#endif // LibLCG_SYSTEM_COMPLEX
--- a/src/lib/lcg_complex.h
+++ b/src/lib/lcg_complex.h
@@ -0,0 +1,329 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _LCG_COMPLEX_H
+#define _LCG_COMPLEX_H
+
+#include "iostream"
+
+#include "algebra.h"
+#ifdef LibLCG_STD_COMPLEX
+
+#include "complex"
+
+typedef std::complex<lcg_float> lcg_complex;
+
+#else
+
+/**
+ * @brief     A simple definition of the complex number type, used when
+ * LibLCG_STD_COMPLEX is not defined. It stores the real and imaginary
+ * parts as two lcg_float members.
+ *
+ * NOTE(review): the virtual destructor makes this a polymorphic type, so an
+ * object is larger than two lcg_float values. Confirm before treating an
+ * array of lcg_complex as interleaved (re, im) pairs.
+ */
+struct lcg_complex
+{
+	lcg_float rel; ///< The real part
+	lcg_float img; ///< The imaginary part
+
+	/**
+	 * @brief      Constructs a new instance with both parts set to zero.
+	 */
+	lcg_complex();
+	/**
+	 * @brief      Constructs a new instance from its two components.
+	 *
+	 * @param[in]  r     The real part of the complex number
+	 * @param[in]  i     The imaginary part of the complex number
+	 */
+	lcg_complex(lcg_float r, lcg_float i);
+	/**
+	 * @brief      Destructor
+	 */
+	virtual ~lcg_complex();
+
+	/**
+	 * @brief      Set the real part of the complex number
+	 * 
+	 * @param a    Input value
+	 */
+	void real(lcg_float a);
+
+	/**
+	 * @brief     Set the imaginary part of the complex number
+	 * 
+	 * @param a   Input value
+	 */
+	void imag(lcg_float a);
+
+	/**
+	 * @brief    Get the real part of the complex number.
+	 *           Not const-qualified, so it cannot be called on a const object.
+	 * 
+	 * @return lcg_float Real component
+	 */
+	lcg_float real();
+
+	/**
+	 * @brief    Get the imaginary part of the complex number.
+	 *           Not const-qualified, so it cannot be called on a const object.
+	 * 
+	 * @return lcg_float Imaginary component
+	 */
+	lcg_float imag();
+};
+
+/**
+ * @brief      Overload the equality operator.
+ *
+ * @param[in]  a     complex number a
+ * @param[in]  b     complex number b
+ *
+ * @return     equal or not
+ */
+bool operator==(const lcg_complex &a, const lcg_complex &b);
+
+/**
+ * @brief      Overload the inequality operator.
+ *
+ * @param[in]  a     complex number a
+ * @param[in]  b     complex number b
+ *
+ * @return     unequal or not
+ */
+bool operator!=(const lcg_complex &a, const lcg_complex &b);
+
+/**
+ * @brief      Overload the addition operator.
+ *
+ * @param[in]  a     complex number a
+ * @param[in]  b     complex number b
+ *
+ * @return     sum
+ */
+lcg_complex operator+(const lcg_complex &a, const lcg_complex &b);
+
+/**
+ * @brief      Overload the subtraction operator.
+ *
+ * @param[in]  a     complex number a
+ * @param[in]  b     complex number b
+ *
+ * @return     subtraction
+ */
+lcg_complex operator-(const lcg_complex &a, const lcg_complex &b);
+
+/**
+ * @brief      Overload the multiplication operator (complex times complex).
+ *
+ * @param[in]  a     complex number a
+ * @param[in]  b     complex number b
+ *
+ * @return     product
+ */
+lcg_complex operator*(const lcg_complex &a, const lcg_complex &b);
+
+/**
+ * @brief      Overload the multiplication operator (real times complex).
+ *
+ * @param[in]  a     real number a
+ * @param[in]  b     complex number b
+ *
+ * @return     product
+ */
+lcg_complex operator*(const lcg_float &a, const lcg_complex &b);
+
+/**
+ * @brief      Overload the division operator (complex divided by complex).
+ *
+ * @param[in]  a     complex number a
+ * @param[in]  b     complex number b
+ *
+ * @return     quotient
+ */
+lcg_complex operator/(const lcg_complex &a, const lcg_complex &b);
+
+/**
+ * @brief      Overload the division operator (real divided by complex).
+ *
+ * @param[in]  a     real number a
+ * @param[in]  b     complex number b
+ *
+ * @return     quotient
+ */
+lcg_complex operator/(const lcg_float &a, const lcg_complex &b);
+
+/**
+ * @brief      Overload the ostream insertion operator.
+ *
+ * @param      os    The ostream
+ * @param[in]  a     complex number a
+ *
+ * @return     The ostream
+ */
+std::ostream &operator<<(std::ostream &os, const lcg_complex &a);
+
+#endif // LibLCG_STD_COMPLEX
+
+/**
+ * @brief      Allocate memory for a lcg_complex array.
+ *
+ * @param[in]  n     Size of the lcg_complex array.
+ *
+ * @return     Pointer of the array's location.
+ */
+lcg_complex* clcg_malloc(int n);
+
+/**
+ * @brief      Locate memory for a lcg_complex second pointer type.
+ *
+ * @param[in]  n     Size of the lcg_float array.
+ *
+ * @return     Pointer of the array's location.
+ */
+lcg_complex** clcg_malloc(int m, int n);
+
+/**
+ * @brief      Destroy memory used by the lcg_complex type array.
+ *
+ * @param      x     Pointer of the array.
+ */
+void clcg_free(lcg_complex* x);
+
+/**
+ * @brief      Destroy memory used by the 2D lcg_complex type array.
+ *
+ * @param      x     Pointer of the array.
+ */
+void clcg_free(lcg_complex **x, int m);
+
+/**
+ * @brief      set a complex vector's value
+ *
+ * @param      a     pointer of the vector
+ * @param[in]  b     initial value
+ * @param[in]  size  vector size
+ */
+void clcg_vecset(lcg_complex *a, lcg_complex b, int size);
+
+/**
+ * @brief      set a 2d complex vector's value
+ *
+ * @param      a     pointer of the matrix
+ * @param[in]  b     initial value
+ * @param[in]  m     row size of the matrix
+ * @param[in]  n     column size of the matrix
+ */
+void clcg_vecset(lcg_complex **a, lcg_complex b, int m, int n);
+
+/**
+ * @brief      set both components of a complex number
+ *
+ * @param      a     Pointer to the complex number to be set
+ * @param[in]  r     The real part of the complex number
+ * @param[in]  i     The imaginary part of the complex number
+ */
+void clcg_set(lcg_complex *a, lcg_float r, lcg_float i);
+
+/**
+ * @brief      Calculate the squared module of a complex number
+ *
+ * @return     The squared module
+ */
+lcg_float clcg_square(const lcg_complex *a);
+/**
+ * @brief      Calculate the module of a complex number
+ *
+ * @return     The module
+ */
+lcg_float clcg_module(const lcg_complex *a);
+/**
+ * @brief      Calculate the conjugate of a complex number
+ *
+ * @return     The complex conjugate.
+ */
+lcg_complex clcg_conjugate(const lcg_complex *a);
+
+/**
+ * @brief      set a complex vector using random values
+ *
+ * @param      a     pointer of the vector
+ * @param[in]  l     the lower bound of random values
+ * @param[in]  h     the higher bound of random values
+ * @param[in]  size  size of the vector
+ */
+void clcg_vecrnd(lcg_complex *a, lcg_complex l, lcg_complex h, int size);
+
+/**
+ * @brief      set a 2D complex vector using random values
+ *
+ * @param      a     pointer of the vector
+ * @param[in]  l     the lower bound of random values
+ * @param[in]  h     the higher bound of random values
+ * @param[in]  m     row size of the vector
+ * @param[in]  n     column size of the vector
+ */
+void clcg_vecrnd(lcg_complex **a, lcg_complex l, lcg_complex h, int m, int n);
+
+/**
+ * @brief      calculate dot product of two complex vectors
+ * 
+ * the product of two complex vectors is defined as <a, b> = \sum{a_i \cdot b_i}
+ *
+ * @param[out] ret     the product
+ * @param[in]  a       complex vector a
+ * @param[in]  b       complex vector b
+ * @param[in]  size    size of the vectors
+ */
+void clcg_dot(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size);
+
+/**
+ * @brief      calculate inner product of two complex vectors
+ * 
+ * the product of two complex vectors is defined as <a, b> = \sum{\bar{a_i} \cdot b_i}
+ *
+ * @param[out] ret     the product
+ * @param[in]  a       complex vector a
+ * @param[in]  b       complex vector b
+ * @param[in]  size    size of the vectors
+ */
+void clcg_inner(lcg_complex &ret, const lcg_complex *a, const lcg_complex *b, int size);
+
+/**
+ * @brief      calculate product of a complex matrix and a complex vector
+ * 
+ * The matrix that is applied to x depends on layout and conjugate.
+ * Different configurations:
+ * layout=Normal,conjugate=NonConjugate -> A
+ * layout=Transpose,conjugate=NonConjugate -> A^T
+ * layout=Normal,conjugate=Conjugate -> \bar{A}
+ * layout=Transpose,conjugate=Conjugate -> A^H
+ *
+ * @param      A          complex matrix A
+ * @param[in]  x          complex vector x
+ * @param      Ax         product of Ax
+ * @param[in]  m_size     row size of A
+ * @param[in]  n_size     column size of A
+ * @param[in]  layout     layout of A used for multiplication. Must be Normal or Transpose
+ * @param[in]  conjugate  whether to use the complex conjugate of A for calculation
+ */
+void clcg_matvec(lcg_complex **A, const lcg_complex *x, lcg_complex *Ax, int m_size, int n_size, 
+	lcg_matrix_e layout = MatNormal, clcg_complex_e conjugate = NonConjugate);
+
+#endif // _LCG_COMPLEX_H
--- a/src/lib/lcg_complex_cuda.cu
+++ b/src/lib/lcg_complex_cuda.cu
@@ -0,0 +1,356 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "lcg_complex_cuda.h"
+#include "complex"
+#include "map"
+
+// Kernel: one thread per matrix row, indexed along x only. Each thread scans
+// the stored entries of its row and copies the one whose column index equals
+// the row index into A_diag. A_diag[row] is not written when the row stores
+// no diagonal entry.
+__global__ void smCcsr_get_diagonal_device(const int *A_row, const int *A_col, const cuComplex *A_val, const int A_len, cuComplex *A_diag)
+{
+	const int row = blockIdx.x * blockDim.x + threadIdx.x;
+	if (row >= A_len) return; // surplus threads of the last block
+
+	const int row_end = A_row[row + 1];
+	for (int k = A_row[row]; k < row_end; k++)
+	{
+		if (A_col[k] == row)
+		{
+			A_diag[row] = A_val[k];
+			return;
+		}
+	}
+}
+
+// Kernel: one thread per matrix row, indexed along x only. Each thread scans
+// the stored entries of its row and copies the one whose column index equals
+// the row index into A_diag. A_diag[row] is not written when the row stores
+// no diagonal entry.
+__global__ void smZcsr_get_diagonal_device(const int *A_row, const int *A_col, const cuDoubleComplex *A_val, const int A_len, cuDoubleComplex *A_diag)
+{
+	const int row = blockIdx.x * blockDim.x + threadIdx.x;
+	if (row >= A_len) return; // surplus threads of the last block
+
+	const int row_end = A_row[row + 1];
+	for (int k = A_row[row]; k < row_end; k++)
+	{
+		if (A_col[k] == row)
+		{
+			A_diag[row] = A_val[k];
+			return;
+		}
+	}
+}
+
+// Kernel: c[i] = a[i] * b[i] for single-precision complex vectors,
+// one thread per element, indexed along x only.
+__global__ void vecMvecC_element_wise_device(const cuComplex *a, const cuComplex *b, cuComplex *c, int n)
+{
+	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx >= n) return; // surplus threads of the last block
+
+	c[idx] = cuCmulf(a[idx], b[idx]);
+}
+
+// Kernel: c[i] = a[i] * b[i] for double-precision complex vectors,
+// one thread per element, indexed along x only.
+__global__ void vecMvecZ_element_wise_device(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n)
+{
+	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx >= n) return; // surplus threads of the last block
+
+	c[idx] = cuCmul(a[idx], b[idx]);
+}
+
+// Kernel: c[i] = a[i] / b[i] for single-precision complex vectors,
+// one thread per element, indexed along x only.
+__global__ void vecDvecC_element_wise_device(const cuComplex *a, const cuComplex *b, cuComplex *c, int n)
+{
+	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx >= n) return; // surplus threads of the last block
+
+	c[idx] = cuCdivf(a[idx], b[idx]);
+}
+
+// Kernel: c[i] = a[i] / b[i] for double-precision complex vectors,
+// one thread per element, indexed along x only.
+__global__ void vecDvecZ_element_wise_device(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n)
+{
+	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx >= n) return; // surplus threads of the last block
+
+	c[idx] = cuCdiv(a[idx], b[idx]);
+}
+
+// Kernel: ca[i] = conj(a[i]) for single-precision complex vectors,
+// one thread per element, indexed along x only.
+// cuConjf negates the imaginary part in float arithmetic; the previous
+// "*= -1.0" used a double literal and promoted the multiply to double.
+__global__ void vecC_conjugate_device(const cuComplex *a, cuComplex *ca, int n)
+{
+	int i = blockIdx.x * blockDim.x + threadIdx.x;
+	if (i < n)
+	{
+		ca[i] = cuConjf(a[i]);
+	}
+	return;
+}
+
+// Kernel: ca[i] = conj(a[i]) for double-precision complex vectors,
+// one thread per element, indexed along x only.
+__global__ void vecZ_conjugate_device(const cuDoubleComplex *a, cuDoubleComplex *ca, int n)
+{
+	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx >= n) return; // surplus threads of the last block
+
+	cuDoubleComplex v = a[idx];
+	v.y *= -1.0; // flip the sign of the imaginary part
+	ca[idx] = v;
+}
+
+// Convert a CUDA double-precision complex value (x = real, y = imaginary)
+// into the library's lcg_complex type.
+lcg_complex cuda2lcg_complex(cuDoubleComplex a)
+{
+	lcg_complex out(a.x, a.y);
+	return out;
+}
+
+#ifdef LibLCG_STD_COMPLEX
+
+// Convert a std::complex based lcg_complex into a CUDA double-precision
+// complex value (x = real, y = imaginary).
+cuDoubleComplex lcg2cuda_complex(lcg_complex a)
+{
+	cuDoubleComplex out;
+	out.x = a.real();
+	out.y = a.imag();
+	return out;
+}
+
+#else
+
+// Convert the hand-written lcg_complex struct into a CUDA double-precision
+// complex value (x = real, y = imaginary).
+// rel and img are data members of lcg_complex, not member functions, so they
+// are read directly; the previous "a.rel()" / "a.img()" did not compile.
+cuDoubleComplex lcg2cuda_complex(lcg_complex a)
+{
+	cuDoubleComplex o;
+	o.x = a.rel; o.y = a.img;
+	return o;
+}
+
+#endif // LibLCG_STD_COMPLEX
+
+// Allocate an array of n cuDoubleComplex values with new[] (host memory).
+// Release it with clcg_free_cuda().
+cuDoubleComplex* clcg_malloc_cuda(size_t n)
+{
+	return new cuDoubleComplex [n];
+}
+
+// Release an array obtained from clcg_malloc_cuda(). A null pointer is
+// ignored. The pointer is passed by value, so the caller's copy is left
+// unchanged.
+void clcg_free_cuda(cuDoubleComplex *x)
+{
+	if (x == nullptr) return;
+
+	delete[] x;
+}
+
+// Set every element of a[0..size) to the value b. The loop runs on the
+// calling (host) thread.
+void clcg_vecset_cuda(cuDoubleComplex *a, cuDoubleComplex b, size_t size)
+{
+	for (size_t k = 0; k < size; k++)
+	{
+		a[k].x = b.x;
+		a[k].y = b.y;
+	}
+}
+
+// Scale a single-precision CUDA complex number by the real factor s.
+// The factor is taken as lcg_float so that this definition matches the
+// prototype in lcg_complex_cuda.h; with a plain float parameter the declared
+// function has no definition whenever lcg_float is not float.
+cuComplex clcg_Cscale(lcg_float s, cuComplex a)
+{
+	cuComplex o;
+	// The product is formed in lcg_float precision and then narrowed to float.
+	o.x = static_cast<float>(s*a.x);
+	o.y = static_cast<float>(s*a.y);
+	return o;
+}
+
+// Host-side sum of two single-precision CUDA complex numbers.
+cuComplex clcg_Csum(cuComplex a, cuComplex b)
+{
+	cuComplex total;
+	total.y = a.y + b.y;
+	total.x = a.x + b.x;
+	return total;
+}
+
+// Host-side difference a - b of two single-precision CUDA complex numbers.
+cuComplex clcg_Cdiff(cuComplex a, cuComplex b)
+{
+	cuComplex delta;
+	delta.y = a.y - b.y;
+	delta.x = a.x - b.x;
+	return delta;
+}
+
+// Host-side complex square root, delegated to std::sqrt on std::complex<float>.
+cuComplex clcg_Csqrt(cuComplex a)
+{
+	const std::complex<float> arg(a.x, a.y);
+	const std::complex<float> root = std::sqrt(arg);
+
+	cuComplex out;
+	out.x = root.real();
+	out.y = root.imag();
+	return out;
+}
+
+// Scale a double-precision CUDA complex number by the real factor s.
+cuDoubleComplex clcg_Zscale(lcg_float s, cuDoubleComplex a)
+{
+	cuDoubleComplex scaled;
+	scaled.y = s*a.y;
+	scaled.x = s*a.x;
+	return scaled;
+}
+
+// Host-side sum of two double-precision CUDA complex numbers.
+cuDoubleComplex clcg_Zsum(cuDoubleComplex a, cuDoubleComplex b)
+{
+	cuDoubleComplex total;
+	total.y = a.y + b.y;
+	total.x = a.x + b.x;
+	return total;
+}
+
+// Host-side difference a - b of two double-precision CUDA complex numbers.
+cuDoubleComplex clcg_Zdiff(cuDoubleComplex a, cuDoubleComplex b)
+{
+	cuDoubleComplex delta;
+	delta.y = a.y - b.y;
+	delta.x = a.x - b.x;
+	return delta;
+}
+
+// Host-side complex square root, delegated to std::sqrt on
+// std::complex<lcg_float>.
+cuDoubleComplex clcg_Zsqrt(cuDoubleComplex a)
+{
+	const std::complex<lcg_float> arg(a.x, a.y);
+	const std::complex<lcg_float> root = std::sqrt(arg);
+
+	cuDoubleComplex out;
+	out.x = root.real();
+	out.y = root.imag();
+	return out;
+}
+
+// Re-order the entries of a COO sparse matrix (host arrays).
+//
+// Every input entry is keyed by the flattened index N*col + row and stored in
+// an ordered map, so the output is sorted by input column first and by input
+// row second. On output the two indices are exchanged: Ac_row receives the
+// input column index and Ac_col the input row index.
+//
+// Entries that share the same (row, col) pair collapse into one map slot (the
+// last value wins), so fewer than nz output elements are written in that case.
+// N must be non-zero whenever nz > 0, because it is used as a divisor.
+void clcg_smCcoo_row2col(const int *A_row, const int *A_col, const cuComplex *A, int N, int nz, int *Ac_row, int *Ac_col, cuComplex *Ac_val)
+{
+	std::map<size_t, cuComplex> sort_map;
+	std::map<size_t, cuComplex>::iterator st_iter;
+	const size_t dim = static_cast<size_t>(N);
+
+	for (int k = 0; k < nz; k++)
+	{
+		// Widen before multiplying: N*A_col[k] evaluated in int overflows
+		// once the product exceeds INT_MAX.
+		const size_t order = dim*static_cast<size_t>(A_col[k]) + static_cast<size_t>(A_row[k]);
+		sort_map[order] = A[k];
+	}
+
+	size_t out = 0;
+	for (st_iter = sort_map.begin(); st_iter != sort_map.end(); st_iter++)
+	{
+		const size_t order = st_iter->first;
+		// exchange the row and column indices to rotate the matrix
+		Ac_row[out] = static_cast<int>(order/dim);
+		Ac_col[out] = static_cast<int>(order%dim);
+		Ac_val[out] = st_iter->second;
+		out++;
+	}
+	return;
+}
+
+// Re-order the entries of a COO sparse matrix (host arrays).
+//
+// Every input entry is keyed by the flattened index N*col + row and stored in
+// an ordered map, so the output is sorted by input column first and by input
+// row second. On output the two indices are exchanged: Ac_row receives the
+// input column index and Ac_col the input row index.
+//
+// Entries that share the same (row, col) pair collapse into one map slot (the
+// last value wins), so fewer than nz output elements are written in that case.
+// N must be non-zero whenever nz > 0, because it is used as a divisor.
+void clcg_smZcoo_row2col(const int *A_row, const int *A_col, const cuDoubleComplex *A, int N, int nz, int *Ac_row, int *Ac_col, cuDoubleComplex *Ac_val)
+{
+	std::map<size_t, cuDoubleComplex> sort_map;
+	std::map<size_t, cuDoubleComplex>::iterator st_iter;
+	const size_t dim = static_cast<size_t>(N);
+
+	for (int k = 0; k < nz; k++)
+	{
+		// Widen before multiplying: N*A_col[k] evaluated in int overflows
+		// once the product exceeds INT_MAX.
+		const size_t order = dim*static_cast<size_t>(A_col[k]) + static_cast<size_t>(A_row[k]);
+		sort_map[order] = A[k];
+	}
+
+	size_t out = 0;
+	for (st_iter = sort_map.begin(); st_iter != sort_map.end(); st_iter++)
+	{
+		const size_t order = st_iter->first;
+		// exchange the row and column indices to rotate the matrix
+		Ac_row[out] = static_cast<int>(order/dim);
+		Ac_col[out] = static_cast<int>(order%dim);
+		Ac_val[out] = st_iter->second;
+		out++;
+	}
+	return;
+}
+
+// Host-side launcher for smCcsr_get_diagonal_device: one thread per matrix
+// row, bk_size threads per block. The pointers are handed to the kernel
+// unchanged. bk_size must be positive, because it is used as a divisor.
+void clcg_smCcsr_get_diagonal(const int *A_ptr, const int *A_col, const cuComplex *A_val, const int A_len, cuComplex *A_diag, int bk_size)
+{
+	// An empty matrix would give a zero-block grid, which is an invalid
+	// launch configuration; there is nothing to extract anyway.
+	if (A_len <= 0) return;
+
+	int blockSize = bk_size;
+	int numBlocks = (A_len + blockSize - 1) / blockSize; // ceil(A_len / blockSize)
+	smCcsr_get_diagonal_device<<<numBlocks, blockSize>>>(A_ptr, A_col, A_val, A_len, A_diag);
+	return;
+}
+
+// Host-side launcher for smZcsr_get_diagonal_device: one thread per matrix
+// row, bk_size threads per block. The pointers are handed to the kernel
+// unchanged. bk_size must be positive, because it is used as a divisor.
+void clcg_smZcsr_get_diagonal(const int *A_ptr, const int *A_col, const cuDoubleComplex *A_val, const int A_len, cuDoubleComplex *A_diag, int bk_size)
+{
+	// An empty matrix would give a zero-block grid, which is an invalid
+	// launch configuration; there is nothing to extract anyway.
+	if (A_len <= 0) return;
+
+	int blockSize = bk_size;
+	int numBlocks = (A_len + blockSize - 1) / blockSize; // ceil(A_len / blockSize)
+	smZcsr_get_diagonal_device<<<numBlocks, blockSize>>>(A_ptr, A_col, A_val, A_len, A_diag);
+	return;
+}
+
+// Host-side launcher for vecMvecC_element_wise_device (c[i] = a[i] * b[i]):
+// one thread per element, bk_size threads per block. The pointers are handed
+// to the kernel unchanged. bk_size must be positive (it is used as a divisor).
+void clcg_vecMvecC_element_wise(const cuComplex *a, const cuComplex *b, cuComplex *c, int n, int bk_size)
+{
+	// An empty vector would give a zero-block grid, which is an invalid
+	// launch configuration.
+	if (n <= 0) return;
+
+	int blockSize = bk_size;
+	int numBlocks = (n + blockSize - 1) / blockSize; // ceil(n / blockSize)
+	vecMvecC_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
+	return;
+}
+
+// Host-side launcher for vecMvecZ_element_wise_device (c[i] = a[i] * b[i]):
+// one thread per element, bk_size threads per block. The pointers are handed
+// to the kernel unchanged. bk_size must be positive (it is used as a divisor).
+void clcg_vecMvecZ_element_wise(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n, int bk_size)
+{
+	// An empty vector would give a zero-block grid, which is an invalid
+	// launch configuration.
+	if (n <= 0) return;
+
+	int blockSize = bk_size;
+	int numBlocks = (n + blockSize - 1) / blockSize; // ceil(n / blockSize)
+	vecMvecZ_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
+	return;
+}
+
+// Host-side launcher for vecDvecC_element_wise_device (c[i] = a[i] / b[i]):
+// one thread per element, bk_size threads per block. The pointers are handed
+// to the kernel unchanged. bk_size must be positive (it is used as a divisor).
+void clcg_vecDvecC_element_wise(const cuComplex *a, const cuComplex *b, cuComplex *c, int n, int bk_size)
+{
+	// An empty vector would give a zero-block grid, which is an invalid
+	// launch configuration.
+	if (n <= 0) return;
+
+	int blockSize = bk_size;
+	int numBlocks = (n + blockSize - 1) / blockSize; // ceil(n / blockSize)
+	vecDvecC_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
+	return;
+}
+
+// Host-side launcher for vecDvecZ_element_wise_device (c[i] = a[i] / b[i]):
+// one thread per element, bk_size threads per block. The pointers are handed
+// to the kernel unchanged. bk_size must be positive (it is used as a divisor).
+void clcg_vecDvecZ_element_wise(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n, int bk_size)
+{
+	// An empty vector would give a zero-block grid, which is an invalid
+	// launch configuration.
+	if (n <= 0) return;
+
+	int blockSize = bk_size;
+	int numBlocks = (n + blockSize - 1) / blockSize; // ceil(n / blockSize)
+	vecDvecZ_element_wise_device<<<numBlocks, blockSize>>>(a, b, c, n);
+	return;
+}
+
+// Host-side launcher for vecC_conjugate_device (ca[i] = conj(a[i])):
+// one thread per element, bk_size threads per block. The pointers are handed
+// to the kernel unchanged. bk_size must be positive (it is used as a divisor).
+void clcg_vecC_conjugate(const cuComplex *a, cuComplex *ca, int n, int bk_size)
+{
+	// An empty vector would give a zero-block grid, which is an invalid
+	// launch configuration.
+	if (n <= 0) return;
+
+	int blockSize = bk_size;
+	int numBlocks = (n + blockSize - 1) / blockSize; // ceil(n / blockSize)
+	vecC_conjugate_device<<<numBlocks, blockSize>>>(a, ca, n);
+	return;
+}
+
+// Host-side launcher for vecZ_conjugate_device (ca[i] = conj(a[i])):
+// one thread per element, bk_size threads per block. The pointers are handed
+// to the kernel unchanged. bk_size must be positive (it is used as a divisor).
+void clcg_vecZ_conjugate(const cuDoubleComplex *a, cuDoubleComplex *ca, int n, int bk_size)
+{
+	// An empty vector would give a zero-block grid, which is an invalid
+	// launch configuration.
+	if (n <= 0) return;
+
+	int blockSize = bk_size;
+	int numBlocks = (n + blockSize - 1) / blockSize; // ceil(n / blockSize)
+	vecZ_conjugate_device<<<numBlocks, blockSize>>>(a, ca, n);
+	return;
+}
--- a/src/lib/lcg_complex_cuda.h
+++ b/src/lib/lcg_complex_cuda.h
@@ -0,0 +1,278 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _LCG_COMPLEX_CUDA_H
+#define _LCG_COMPLEX_CUDA_H
+
+#include "lcg_complex.h"
+
+#ifdef LibLCG_CUDA
+
+#include <cuda_runtime.h>
+#include <cuComplex.h>
+
+/**
+ * @brief  Convert cuda complex number to lcg complex number
+ * 
+ * @param a CUDA complex number
+ * @return lcg_complex  lcg complex number
+ */
+lcg_complex cuda2lcg_complex(cuDoubleComplex a);
+
+/**
+ * @brief Convert lcg complex number to CUDA complex number
+ * 
+ * @param a lcg complex number
+ * @return cuDoubleComplex CUDA complex number
+ */
+cuDoubleComplex lcg2cuda_complex(lcg_complex a);
+
+/**
+ * @brief      Locate memory for a cuDoubleComplex pointer type.
+ *
+ * @param[in]  n     Size of the lcg_float array.
+ *
+ * @return     Pointer of the array's location.
+ */
+cuDoubleComplex* clcg_malloc_cuda(size_t n);
+
+/**
+ * @brief      Destroy memory used by the cuDoubleComplex type array.
+ *
+ * @param      x     Pointer of the array.
+ */
+void clcg_free_cuda(cuDoubleComplex *x);
+
+/**
+ * @brief      set a complex vector's value
+ *
+ * @param      a     pointer of the vector
+ * @param[in]  b     initial value
+ * @param[in]  size  vector size
+ */
+void clcg_vecset_cuda(cuDoubleComplex *a, cuDoubleComplex b, size_t size);
+
+/**
+ * @brief    Host side function for scaling a cuComplex object
+ * 
+ * @param s  scale factor
+ * @param a  Complex number
+ * @return cuComplex  scaled complex number
+ */
+cuComplex clcg_Cscale(lcg_float s, cuComplex a);
+
+/**
+ * @brief   Calculate the sum of two cuda complex number. This is a host side function.
+ * 
+ * @param a Complex number
+ * @param b Complex number
+ * @return cuComplex Sum of the input complex number 
+ */
+cuComplex clcg_Csum(cuComplex a, cuComplex b);
+
+/**
+ * @brief   Calculate the difference of two cuda complex number. This is a host side function.
+ * 
+ * @param a Complex number
+ * @param b Complex number
+ * @return cuComplex Difference of the input complex number 
+ */
+cuComplex clcg_Cdiff(cuComplex a, cuComplex b);
+
+/**
+ * @brief   Calculate the sqrt() of a cuda complex number
+ * 
+ * @param a Complex number
+ * @return cuComplex root value
+ */
+cuComplex clcg_Csqrt(cuComplex a);
+
+/**
+ * @brief    Host side function for scale a cuDoubleComplex object
+ * 
+ * @param s  scale factor
+ * @param a  Complex number
+ * @return cuDoubleComplex  scaled complex number
+ */
+cuDoubleComplex clcg_Zscale(lcg_float s, cuDoubleComplex a);
+
+/**
+ * @brief   Calculate the sum of two cuda complex number. This is a host side function.
+ * 
+ * @param a Complex number
+ * @param b Complex number
+ * @return cuDoubleComplex Sum of the input complex number 
+ */
+cuDoubleComplex clcg_Zsum(cuDoubleComplex a, cuDoubleComplex b);
+
+/**
+ * @brief   Calculate the difference of two cuda complex number. This is a host side function.
+ * 
+ * @param a Complex number
+ * @param b Complex number
+ * @return cuDoubleComplex Difference of the input complex number 
+ */
+cuDoubleComplex clcg_Zdiff(cuDoubleComplex a, cuDoubleComplex b);
+
+/**
+ * @brief   Calculate the sqrt() of a cuda complex number
+ * 
+ * @param a Complex number
+ * @return cuDoubleComplex root value
+ */
+cuDoubleComplex clcg_Zsqrt(cuDoubleComplex a);
+
+/**
+ * @brief   Convert the indexing sequence of a sparse matrix from the row-major to col-major format.
+ * 
+ * @note    The sparse matrix is stored in the COO format. This is a host side function.
+ * 
+ * @param A_row      Row index
+ * @param A_col      Column index
+ * @param A          Non-zero values of the matrix
+ * @param N          Row/column length of A
+ * @param nz         Number of the non-zero values in A
+ * @param Ac_row     Output row index
+ * @param Ac_col     Output column index
+ * @param Ac_val     Non-zero values of the output matrix
+ */
+void clcg_smCcoo_row2col(const int *A_row, const int *A_col, const cuComplex *A, int N, int nz, int *Ac_row, int *Ac_col, cuComplex *Ac_val);
+
+/**
+ * @brief   Convert the indexing sequence of a sparse matrix from the row-major to col-major format.
+ * 
+ * @note    The sparse matrix is stored in the COO format. This is a host side function.
+ * 
+ * @param A_row      Row index
+ * @param A_col      Column index
+ * @param A          Non-zero values of the matrix
+ * @param N          Row/column length of A
+ * @param nz         Number of the non-zero values in A
+ * @param Ac_row     Output row index
+ * @param Ac_col     Output column index
+ * @param Ac_val     Non-zero values of the output matrix
+ */
+void clcg_smZcoo_row2col(const int *A_row, const int *A_col, const cuDoubleComplex *A, int N, int nz, int *Ac_row, int *Ac_col, cuDoubleComplex *Ac_val);
+
+/**
+ * @brief      Extract diagonal elements from a square CUDA sparse matrix that is formatted in the CSR format
+ * 
+ * @note       This is a device side function. All memories must be allocated on the GPU device.
+ *
+ * @param[in]  A_ptr   Row index pointer
+ * @param[in]  A_col   Column index
+ * @param[in]  A_val   Non-zero values of the matrix
+ * @param[in]  A_len   Dimension of the matrix
+ * @param      A_diag  Output diagonal elements
+ * @param[in]  bk_size Default CUDA block size.
+ */
+void clcg_smCcsr_get_diagonal(const int *A_ptr, const int *A_col, const cuComplex *A_val, const int A_len, cuComplex *A_diag, int bk_size = 1024);
+
+/**
+ * @brief      Extract diagonal elements from a square CUDA sparse matrix that is formatted in the CSR format
+ * 
+ * @note       This is a device side function. All memories must be allocated on the GPU device.
+ *
+ * @param[in]  A_ptr   Row index pointer
+ * @param[in]  A_col   Column index
+ * @param[in]  A_val   Non-zero values of the matrix
+ * @param[in]  A_len   Dimension of the matrix
+ * @param      A_diag  Output diagonal elements
+ * @param[in]  bk_size Default CUDA block size.
+ */
+void clcg_smZcsr_get_diagonal(const int *A_ptr, const int *A_col, const cuDoubleComplex *A_val, const int A_len, cuDoubleComplex *A_diag, int bk_size = 1024);
+
+/**
+ * @brief      Element-wise multiplication between two CUDA arrays.
+ * 
+ * @note       This is a device side function. All memories must be allocated on the GPU device.
+ *
+ * @param[in]  a     Pointer of the input array
+ * @param[in]  b     Pointer of the input array
+ * @param      c     Pointer of the output array
+ * @param[in]  n     Length of the arrays
+ * @param[in]  bk_size Default CUDA block size.
+ */
+void clcg_vecMvecC_element_wise(const cuComplex *a, const cuComplex *b, cuComplex *c, int n, int bk_size = 1024);
+
+/**
+ * @brief      Element-wise multiplication between two CUDA arrays.
+ * 
+ * @note       This is a device side function. All memories must be allocated on the GPU device.
+ *
+ * @param[in]  a     Pointer of the input array
+ * @param[in]  b     Pointer of the input array
+ * @param      c     Pointer of the output array
+ * @param[in]  n     Length of the arrays
+ * @param[in]  bk_size Default CUDA block size.
+ */
+void clcg_vecMvecZ_element_wise(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n, int bk_size = 1024);
+
+/**
+ * @brief      Element-wise division between two CUDA arrays.
+ * 
+ * @note       This is a device side function. All memories must be allocated on the GPU device.
+ *
+ * @param[in]  a     Pointer of the input array
+ * @param[in]  b     Pointer of the input array
+ * @param      c     Pointer of the output array
+ * @param[in]  n     Length of the arrays
+ * @param[in]  bk_size Default CUDA block size.
+ */
+void clcg_vecDvecC_element_wise(const cuComplex *a, const cuComplex *b, cuComplex *c, int n, int bk_size = 1024);
+
+/**
+ * @brief      Element-wise division between two CUDA arrays.
+ * 
+ * @note       This is a device side function. All memories must be allocated on the GPU device.
+ *
+ * @param[in]  a     Pointer of the input array
+ * @param[in]  b     Pointer of the input array
+ * @param      c     Pointer of the output array
+ * @param[in]  n     Length of the arrays
+ * @param[in]  bk_size Default CUDA block size.
+ */
+void clcg_vecDvecZ_element_wise(const cuDoubleComplex *a, const cuDoubleComplex *b, cuDoubleComplex *c, int n, int bk_size = 1024);
+
+/**
+ * @brief      Return complex conjugates of an input CUDA complex array
+ * 
+ * @param a    Pointer of the input array
+ * @param ca   Pointer of the output array
+ * @param n    Length of the arrays
+ * @param[in]  bk_size Default CUDA block size.
+ */
+void clcg_vecC_conjugate(const cuComplex *a, cuComplex *ca, int n, int bk_size = 1024);
+
+/**
+ * @brief      Return complex conjugates of an input CUDA complex array
+ * 
+ * @param a    Pointer of the input array
+ * @param ca   Pointer of the output array
+ * @param n    Length of the arrays
+ * @param[in]  bk_size Default CUDA block size.
+ */
+void clcg_vecZ_conjugate(const cuDoubleComplex *a, cuDoubleComplex *ca, int n, int bk_size = 1024);
+
+#endif // LibLCG_CUDA
+
+#endif // _LCG_COMPLEX_CUDA_H
--- a/src/lib/lcg_cuda.cu
+++ b/src/lib/lcg_cuda.cu
@@ -0,0 +1,685 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "cmath"
+#include "ctime"
+#include "iostream"
+
+#include "lcg_cuda.h"
+
+
+// Common signature of the unconstrained, non-preconditioned solvers in this file.
+// lcg_solver_cuda() stores the selected solver in a pointer of this type.
+typedef int (*lcg_solver_cuda_ptr)(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
+    const int n_size, const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Forward declaration of the conjugate gradient solver (selected by LCG_CG), defined below.
+int lcg(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, 
+    const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Forward declaration of the solver selected by LCG_CGS, defined below.
+int lcgs(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, 
+    const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+
+// Dispatches to one of the unconstrained solvers of this file.
+// LCG_CGS selects lcgs(); LCG_CG and every other solver id select lcg().
+// All remaining arguments are forwarded unchanged and the solver's status is returned.
+int lcg_solver_cuda(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, 
+    const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id)
+{
+	// lcg() doubles as the fallback, so only LCG_CGS needs to be told apart.
+	lcg_solver_cuda_ptr chosen_solver = (solver_id == LCG_CGS) ? &lcgs : &lcg;
+
+	return chosen_solver(Afp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
+}
+
+
+// Forward declaration of the preconditioned conjugate gradient solver defined below.
+int lpcg(lcg_axfunc_cuda_ptr Afp, lcg_axfunc_cuda_ptr Mfp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
+	const int n_size, const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Entry point of the preconditioned solvers. lpcg() is the only solver wired in here,
+// so solver_id is accepted for interface symmetry but is not consulted.
+int lcg_solver_preconditioned_cuda(lcg_axfunc_cuda_ptr Afp, lcg_axfunc_cuda_ptr Mfp, lcg_progress_cuda_ptr Pfp, 
+    lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, const lcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id)
+{
+	(void)solver_id; // intentionally unused
+
+	const int status = lpcg(Afp, Mfp, Pfp, m, B, n_size, nz_size, param, instance, cub_handle, cus_handle);
+	return status;
+}
+
+
+// Forward declaration of the box-constrained solver defined below.
+int lpg(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
+	const lcg_float* low, const lcg_float* hig, const int n_size, const int nz_size, const lcg_para* param, 
+	void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+// Entry point of the constrained solvers. lpg() is the only solver wired in here,
+// so solver_id is accepted for interface symmetry but is not consulted.
+int lcg_solver_constrained_cuda(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
+    const lcg_float* low, const lcg_float* hig, const int n_size, const int nz_size, const lcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id)
+{
+	(void)solver_id; // intentionally unused
+
+	const int status = lpg(Afp, Pfp, m, B, low, hig, n_size, nz_size, param, instance, cub_handle, cus_handle);
+	return status;
+}
+
+
+/**
+ * @brief      Conjugate gradient solver for A*m = B built on cuBLAS level-1 routines.
+ *
+ * m and B are copied into freshly allocated device buffers (cudaMemcpyHostToDevice), the
+ * iteration runs on those buffers and the solution is copied back into m before returning.
+ * The convergence measure is sqrt(g.g)/n_size when para.abs_diff is set, and
+ * g.g/max(g0.g0, 1) otherwise, where g = A*m - B and g0 is the initial g.
+ *
+ * NOTE(review): the cuBLAS D-routines and CUDA_R_64F descriptors are used, so lcg_float
+ * is presumably double - confirm against util.h.
+ *
+ * @param[in]  Afp         Callback computing the product A*x on device vectors.
+ * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value
+ *                         inside the iteration stops the solver with LCG_STOP.
+ * @param      m           Initial solution on input, final solution on output.
+ * @param[in]  B           Right-hand side vector.
+ * @param[in]  n_size      Length of m and B.
+ * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
+ * @param[in]  param       Solver parameters; defparam is used when nullptr.
+ * @param      instance    User data forwarded to the callbacks.
+ * @param      cub_handle  cuBLAS handle.
+ * @param      cus_handle  cuSPARSE handle (forwarded to Afp).
+ *
+ * @return     LCG_ALREADY_OPTIMIZIED, LCG_CONVERGENCE, LCG_STOP, LCG_REACHED_MAX_ITERATIONS,
+ *             or one of the LCG_INVILAD_* / LCG_INVALID_POINTER codes for bad arguments.
+ */
+int lcg(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
+    const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	// Use the caller's parameters when given, the library defaults otherwise.
+	lcg_para para = (param != nullptr) ? (*param) : defparam;
+
+	// Argument validation. Nothing has been allocated yet, so plain returns are safe.
+	if (n_size <= 0) return LCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return LCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return LCG_INVILAD_EPSILON;
+	if (m == nullptr || B == nullptr || cub_handle == nullptr || cus_handle == nullptr)
+	{
+		return LCG_INVALID_POINTER;
+	}
+
+	// Device work space: solution, right-hand side, gradient, search direction and A*direction.
+	const size_t vec_bytes = n_size * sizeof(lcg_float);
+	lcg_float *dev_x = nullptr, *dev_b = nullptr;
+	lcg_float *grad = nullptr, *dir = nullptr, *A_dir = nullptr;
+	cudaMalloc(&dev_x, vec_bytes);
+	cudaMalloc(&dev_b, vec_bytes);
+	cudaMalloc(&grad, vec_bytes);
+	cudaMalloc(&dir, vec_bytes);
+	cudaMalloc(&A_dir, vec_bytes);
+
+	// Upload the initial solution and the right-hand side.
+	cudaMemcpy(dev_x, m, vec_bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(dev_b, B, vec_bytes, cudaMemcpyHostToDevice);
+
+	// Dense vector descriptors handed to the A*x callback.
+	cusparseDnVecDescr_t x_desc, dir_desc, Adir_desc;
+	cusparseCreateDnVec(&x_desc, n_size, dev_x, CUDA_R_64F);
+	cusparseCreateDnVec(&dir_desc, n_size, dir, CUDA_R_64F);
+	cusparseCreateDnVec(&Adir_desc, n_size, A_dir, CUDA_R_64F);
+
+	lcg_float neg_one = -1.0;
+
+	// A_dir temporarily holds A*x0.
+	Afp(instance, cub_handle, cus_handle, x_desc, Adir_desc, n_size, nz_size);
+
+	// grad = A*x0 - B
+	cudaMemcpy(grad, A_dir, vec_bytes, cudaMemcpyDeviceToDevice);
+	cublasDaxpy_v2(cub_handle, n_size, &neg_one, dev_b, 1, grad, 1);
+	// dir = -grad
+	cudaMemset(dir, 0, vec_bytes);
+	cublasDaxpy_v2(cub_handle, n_size, &neg_one, grad, 1, dir, 1);
+
+	// grad_sq is the squared 2-norm of the gradient (grad . grad).
+	lcg_float grad_sq;
+	cublasDdot_v2(cub_handle, n_size, grad, 1, grad, 1, &grad_sq);
+
+	// Normalisation of the relative residual, never smaller than one.
+	lcg_float norm_ref = grad_sq;
+	if (norm_ref < 1.0) norm_ref = 1.0;
+
+	int ret;
+	if (para.abs_diff && sqrt(grad_sq)/n_size <= para.epsilon)
+	{
+		// The initial guess already meets the absolute criterion.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, dev_x, sqrt(grad_sq)/n_size, &para, n_size, nz_size, 0);
+		}
+	}
+	else if (grad_sq/norm_ref <= para.epsilon)
+	{
+		// The initial guess already meets the relative criterion.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, dev_x, grad_sq/norm_ref, &para, n_size, nz_size, 0);
+		}
+	}
+	else
+	{
+		int iter = 0;
+		lcg_float dir_A_dir, step, beta, next_grad_sq, residual;
+		for (;;)
+		{
+			residual = para.abs_diff ? sqrt(grad_sq)/n_size : grad_sq/norm_ref;
+
+			// The progress callback sees the device copy of the solution.
+			if (Pfp != nullptr && Pfp(instance, dev_x, residual, &para, n_size, nz_size, iter))
+			{
+				ret = LCG_STOP;
+				break;
+			}
+
+			if (residual <= para.epsilon)
+			{
+				ret = LCG_CONVERGENCE;
+				break;
+			}
+
+			// max_iterations == 0 means "no limit".
+			if (para.max_iterations > 0 && iter+1 > para.max_iterations)
+			{
+				ret = LCG_REACHED_MAX_ITERATIONS;
+				break;
+			}
+
+			iter++;
+
+			// A_dir = A*dir
+			Afp(instance, cub_handle, cus_handle, dir_desc, Adir_desc, n_size, nz_size);
+
+			// step = (grad . grad) / (dir . A*dir)
+			cublasDdot_v2(cub_handle, n_size, dir, 1, A_dir, 1, &dir_A_dir);
+			step = grad_sq/dir_A_dir;
+
+			cublasDaxpy_v2(cub_handle, n_size, &step, dir, 1, dev_x, 1); // x += step*dir
+			cublasDaxpy_v2(cub_handle, n_size, &step, A_dir, 1, grad, 1); // grad += step*A*dir
+
+			cublasDdot_v2(cub_handle, n_size, grad, 1, grad, 1, &next_grad_sq);
+			beta = next_grad_sq/grad_sq;
+			grad_sq = next_grad_sq;
+
+			// dir = beta*dir - grad
+			cublasDscal_v2(cub_handle, n_size, &beta, dir, 1);
+			cublasDaxpy_v2(cub_handle, n_size, &neg_one, grad, 1, dir, 1);
+		}
+	}
+
+	// Hand the solution back to the caller, then release every device resource.
+	cudaMemcpy(m, dev_x, vec_bytes, cudaMemcpyDeviceToHost);
+
+	cudaFree(dev_x);
+	cudaFree(dev_b);
+	cudaFree(dir);
+	cudaFree(grad);
+	cudaFree(A_dir);
+	cusparseDestroyDnVec(x_desc);
+	cusparseDestroyDnVec(dir_desc);
+	cusparseDestroyDnVec(Adir_desc);
+
+	return ret;
+}
+
+/**
+ * @brief      Solver selected by LCG_CGS for A*m = B, built on cuBLAS level-1 routines.
+ *
+ * m and B are copied into freshly allocated device buffers (cudaMemcpyHostToDevice), the
+ * iteration runs on those buffers and the solution is copied back into m before returning.
+ * Each iteration calls Afp twice. The convergence measure is sqrt(r.r)/n_size when
+ * para.abs_diff is set, and r.r/max(r0.r0, 1) otherwise, where r = B - A*m.
+ *
+ * NOTE(review): the cuBLAS D-routines and CUDA_R_64F descriptors are used, so lcg_float
+ * is presumably double - confirm against util.h.
+ *
+ * @param[in]  Afp         Callback computing the product A*x on device vectors.
+ * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value
+ *                         inside the iteration stops the solver with LCG_STOP.
+ * @param      m           Initial solution on input, final solution on output.
+ * @param[in]  B           Right-hand side vector.
+ * @param[in]  n_size      Length of m and B.
+ * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
+ * @param[in]  param       Solver parameters; defparam is used when nullptr.
+ * @param      instance    User data forwarded to the callbacks.
+ * @param      cub_handle  cuBLAS handle.
+ * @param      cus_handle  cuSPARSE handle (forwarded to Afp).
+ *
+ * @return     LCG_ALREADY_OPTIMIZIED, LCG_CONVERGENCE, LCG_STOP, LCG_REACHED_MAX_ITERATIONS,
+ *             or one of the LCG_INVILAD_* / LCG_INVALID_POINTER codes for bad arguments.
+ */
+int lcgs(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, const int n_size, 
+    const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	// Use the caller's parameters when given, the library defaults otherwise.
+	lcg_para para = (param != nullptr) ? (*param) : defparam;
+
+	// Argument validation. Nothing has been allocated yet, so plain returns are safe.
+	if (n_size <= 0) return LCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return LCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return LCG_INVILAD_EPSILON;
+	if (m == nullptr || B == nullptr || cub_handle == nullptr || cus_handle == nullptr)
+	{
+		return LCG_INVALID_POINTER;
+	}
+
+	// Device work space.
+	const size_t vec_bytes = n_size * sizeof(lcg_float);
+	lcg_float *dev_x = nullptr, *dev_b = nullptr;
+	lcg_float *res = nullptr, *shadow = nullptr, *p_vec = nullptr, *qp_vec = nullptr;
+	lcg_float *A_out = nullptr, *u_vec = nullptr, *q_vec = nullptr, *w_vec = nullptr;
+	cudaMalloc(&dev_x, vec_bytes);
+	cudaMalloc(&dev_b, vec_bytes);
+	cudaMalloc(&res, vec_bytes);
+	cudaMalloc(&shadow, vec_bytes);
+	cudaMalloc(&p_vec, vec_bytes);
+	cudaMalloc(&qp_vec, vec_bytes);
+	cudaMalloc(&A_out, vec_bytes);
+	cudaMalloc(&u_vec, vec_bytes);
+	cudaMalloc(&q_vec, vec_bytes);
+	cudaMalloc(&w_vec, vec_bytes);
+
+	// Upload the initial solution and the right-hand side.
+	cudaMemcpy(dev_x, m, vec_bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(dev_b, B, vec_bytes, cudaMemcpyHostToDevice);
+
+	// Dense vector descriptors handed to the A*x callback.
+	cusparseDnVecDescr_t x_desc, w_desc, p_desc, Aout_desc;
+	cusparseCreateDnVec(&x_desc, n_size, dev_x, CUDA_R_64F);
+	cusparseCreateDnVec(&w_desc, n_size, w_vec, CUDA_R_64F);
+	cusparseCreateDnVec(&p_desc, n_size, p_vec, CUDA_R_64F);
+	cusparseCreateDnVec(&Aout_desc, n_size, A_out, CUDA_R_64F);
+
+	lcg_float pos_one = 1.0;
+	lcg_float neg_one = -1.0;
+
+	// A_out = A*x0
+	Afp(instance, cub_handle, cus_handle, x_desc, Aout_desc, n_size, nz_size);
+
+	// res = B - A*x0
+	cudaMemcpy(res, dev_b, vec_bytes, cudaMemcpyDeviceToDevice);
+	cublasDaxpy_v2(cub_handle, n_size, &neg_one, A_out, 1, res, 1);
+	// p = u = shadow = res
+	cudaMemcpy(p_vec, res, vec_bytes, cudaMemcpyDeviceToDevice);
+	cudaMemcpy(u_vec, res, vec_bytes, cudaMemcpyDeviceToDevice);
+	cudaMemcpy(shadow, res, vec_bytes, cudaMemcpyDeviceToDevice);
+
+	// rho = res . shadow
+	lcg_float rho;
+	cublasDdot_v2(cub_handle, n_size, res, 1, shadow, 1, &rho);
+
+	// res_sq is the squared 2-norm of the residual (res . res).
+	lcg_float res_sq;
+	cublasDdot_v2(cub_handle, n_size, res, 1, res, 1, &res_sq);
+
+	// Normalisation of the relative residual, never smaller than one.
+	lcg_float norm_ref = res_sq;
+	if (norm_ref < 1.0) norm_ref = 1.0;
+
+	int ret;
+	if (para.abs_diff && sqrt(res_sq)/n_size <= para.epsilon)
+	{
+		// The initial guess already meets the absolute criterion.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, dev_x, sqrt(res_sq)/n_size, &para, n_size, nz_size, 0);
+		}
+	}
+	else if (res_sq/norm_ref <= para.epsilon)
+	{
+		// The initial guess already meets the relative criterion.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, dev_x, res_sq/norm_ref, &para, n_size, nz_size, 0);
+		}
+	}
+	else
+	{
+		int iter = 0;
+		lcg_float step, neg_step, next_rho, shadow_Ap, beta, residual;
+		for (;;)
+		{
+			residual = para.abs_diff ? sqrt(res_sq)/n_size : res_sq/norm_ref;
+
+			// The progress callback sees the device copy of the solution.
+			if (Pfp != nullptr && Pfp(instance, dev_x, residual, &para, n_size, nz_size, iter))
+			{
+				ret = LCG_STOP;
+				break;
+			}
+
+			if (residual <= para.epsilon)
+			{
+				ret = LCG_CONVERGENCE;
+				break;
+			}
+
+			// max_iterations == 0 means "no limit".
+			if (para.max_iterations > 0 && iter+1 > para.max_iterations)
+			{
+				ret = LCG_REACHED_MAX_ITERATIONS;
+				break;
+			}
+
+			iter++;
+
+			// A_out = A*p
+			Afp(instance, cub_handle, cus_handle, p_desc, Aout_desc, n_size, nz_size);
+
+			// step = rho / (shadow . A*p)
+			shadow_Ap = 0.0;
+			cublasDdot_v2(cub_handle, n_size, shadow, 1, A_out, 1, &shadow_Ap);
+			step = rho/shadow_Ap;
+			neg_step = -1.0*step;
+
+			// q = u - step*A*p ; w = u + q
+			cudaMemcpy(q_vec, u_vec, vec_bytes, cudaMemcpyDeviceToDevice);
+			cudaMemcpy(w_vec, u_vec, vec_bytes, cudaMemcpyDeviceToDevice);
+			cublasDaxpy_v2(cub_handle, n_size, &neg_step, A_out, 1, q_vec, 1);
+			cublasDaxpy_v2(cub_handle, n_size, &pos_one, q_vec, 1, w_vec, 1);
+
+			// A_out = A*w
+			Afp(instance, cub_handle, cus_handle, w_desc, Aout_desc, n_size, nz_size);
+
+			cublasDaxpy_v2(cub_handle, n_size, &step, w_vec, 1, dev_x, 1); // x += step*w
+			cublasDaxpy_v2(cub_handle, n_size, &neg_step, A_out, 1, res, 1); // res -= step*A*w
+
+			cublasDdot_v2(cub_handle, n_size, res, 1, res, 1, &res_sq);
+
+			cublasDdot_v2(cub_handle, n_size, res, 1, shadow, 1, &next_rho);
+			beta = next_rho/rho;
+			rho = next_rho;
+
+			// u = res + beta*q
+			cudaMemcpy(u_vec, res, vec_bytes, cudaMemcpyDeviceToDevice);
+			cublasDaxpy_v2(cub_handle, n_size, &beta, q_vec, 1, u_vec, 1);
+
+			// qp = q + beta*p
+			cudaMemcpy(qp_vec, q_vec, vec_bytes, cudaMemcpyDeviceToDevice);
+			cublasDaxpy_v2(cub_handle, n_size, &beta, p_vec, 1, qp_vec, 1);
+
+			// p = u + beta*qp
+			cudaMemcpy(p_vec, u_vec, vec_bytes, cudaMemcpyDeviceToDevice);
+			cublasDaxpy_v2(cub_handle, n_size, &beta, qp_vec, 1, p_vec, 1);
+		}
+	}
+
+	// Hand the solution back to the caller, then release every device resource.
+	cudaMemcpy(m, dev_x, vec_bytes, cudaMemcpyDeviceToHost);
+
+	cudaFree(dev_x);
+	cudaFree(dev_b);
+	cudaFree(res);
+	cudaFree(shadow);
+	cudaFree(p_vec);
+	cudaFree(qp_vec);
+	cudaFree(A_out);
+	cudaFree(u_vec);
+	cudaFree(q_vec);
+	cudaFree(w_vec);
+	cusparseDestroyDnVec(x_desc);
+	cusparseDestroyDnVec(w_desc);
+	cusparseDestroyDnVec(p_desc);
+	cusparseDestroyDnVec(Aout_desc);
+
+	return ret;
+}
+
+/**
+ * @brief      Preconditioned conjugate gradient solver for A*m = B built on cuBLAS level-1 routines.
+ *
+ * m and B are copied into freshly allocated device buffers (cudaMemcpyHostToDevice), the
+ * iteration runs on those buffers and the solution is copied back into m before returning.
+ * Mfp is applied to the residual r = B - A*m to obtain the preconditioned residual z.
+ * The convergence measure is sqrt(r.r)/n_size when para.abs_diff is set, and
+ * r.r/max(r0.r0, 1) otherwise.
+ *
+ * NOTE(review): Afp and Mfp are called unconditionally, so both must be valid callbacks;
+ * the cuBLAS D-routines and CUDA_R_64F descriptors suggest lcg_float is double - confirm.
+ *
+ * @param[in]  Afp         Callback computing the product A*x on device vectors.
+ * @param[in]  Mfp         Callback applying the preconditioner to a device vector.
+ * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value
+ *                         inside the iteration stops the solver with LCG_STOP.
+ * @param      m           Initial solution on input, final solution on output.
+ * @param[in]  B           Right-hand side vector.
+ * @param[in]  n_size      Length of m and B.
+ * @param[in]  nz_size     Forwarded unchanged to Afp, Mfp and Pfp.
+ * @param[in]  param       Solver parameters; defparam is used when nullptr.
+ * @param      instance    User data forwarded to the callbacks.
+ * @param      cub_handle  cuBLAS handle.
+ * @param      cus_handle  cuSPARSE handle (forwarded to Afp and Mfp).
+ *
+ * @return     LCG_ALREADY_OPTIMIZIED, LCG_CONVERGENCE, LCG_STOP, LCG_REACHED_MAX_ITERATIONS,
+ *             or one of the LCG_INVILAD_* / LCG_INVALID_POINTER codes for bad arguments.
+ */
+int lpcg(lcg_axfunc_cuda_ptr Afp, lcg_axfunc_cuda_ptr Mfp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
+	const int n_size, const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	// Use the caller's parameters when given, the library defaults otherwise.
+	lcg_para para = (param != nullptr) ? (*param) : defparam;
+
+	// Argument validation. Nothing has been allocated yet, so plain returns are safe.
+	if (n_size <= 0) return LCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return LCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return LCG_INVILAD_EPSILON;
+	if (m == nullptr || B == nullptr || cub_handle == nullptr || cus_handle == nullptr)
+	{
+		return LCG_INVALID_POINTER;
+	}
+
+	// Device work space: solution, right-hand side, residual, preconditioned residual,
+	// search direction and A*direction.
+	const size_t vec_bytes = n_size * sizeof(lcg_float);
+	lcg_float *dev_x = nullptr, *dev_b = nullptr;
+	lcg_float *res = nullptr, *prec = nullptr, *dir = nullptr, *A_dir = nullptr;
+	cudaMalloc(&dev_x, vec_bytes);
+	cudaMalloc(&dev_b, vec_bytes);
+	cudaMalloc(&res, vec_bytes);
+	cudaMalloc(&prec, vec_bytes);
+	cudaMalloc(&dir, vec_bytes);
+	cudaMalloc(&A_dir, vec_bytes);
+
+	// Upload the initial solution and the right-hand side.
+	cudaMemcpy(dev_x, m, vec_bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(dev_b, B, vec_bytes, cudaMemcpyHostToDevice);
+
+	// Dense vector descriptors handed to the callbacks.
+	cusparseDnVecDescr_t x_desc, res_desc, prec_desc, dir_desc, Adir_desc;
+	cusparseCreateDnVec(&x_desc, n_size, dev_x, CUDA_R_64F);
+	cusparseCreateDnVec(&res_desc, n_size, res, CUDA_R_64F);
+	cusparseCreateDnVec(&prec_desc, n_size, prec, CUDA_R_64F);
+	cusparseCreateDnVec(&dir_desc, n_size, dir, CUDA_R_64F);
+	cusparseCreateDnVec(&Adir_desc, n_size, A_dir, CUDA_R_64F);
+
+	lcg_float pos_one = 1.0;
+	lcg_float neg_one = -1.0;
+
+	// A_dir temporarily holds A*x0.
+	Afp(instance, cub_handle, cus_handle, x_desc, Adir_desc, n_size, nz_size);
+
+	// res = B - A*x0
+	cudaMemcpy(res, dev_b, vec_bytes, cudaMemcpyDeviceToDevice);
+	cublasDaxpy_v2(cub_handle, n_size, &neg_one, A_dir, 1, res, 1);
+
+	// prec = M(res)
+	Mfp(instance, cub_handle, cus_handle, res_desc, prec_desc, n_size, nz_size);
+
+	// dir = prec
+	cudaMemcpy(dir, prec, vec_bytes, cudaMemcpyDeviceToDevice);
+
+	// res_sq is the squared 2-norm of the residual (res . res).
+	lcg_float res_sq;
+	cublasDdot_v2(cub_handle, n_size, res, 1, res, 1, &res_sq);
+
+	// Normalisation of the relative residual, never smaller than one.
+	lcg_float norm_ref = res_sq;
+	if (norm_ref < 1.0) norm_ref = 1.0;
+
+	// prec_res = prec . res
+	lcg_float prec_res;
+	cublasDdot_v2(cub_handle, n_size, prec, 1, res, 1, &prec_res);
+
+	int ret;
+	if (para.abs_diff && sqrt(res_sq)/n_size <= para.epsilon)
+	{
+		// The initial guess already meets the absolute criterion.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, dev_x, sqrt(res_sq)/n_size, &para, n_size, nz_size, 0);
+		}
+	}
+	else if (res_sq/norm_ref <= para.epsilon)
+	{
+		// The initial guess already meets the relative criterion.
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, dev_x, res_sq/norm_ref, &para, n_size, nz_size, 0);
+		}
+	}
+	else
+	{
+		int iter = 0;
+		lcg_float dir_A_dir, step, neg_step, beta, next_prec_res, residual;
+		for (;;)
+		{
+			residual = para.abs_diff ? sqrt(res_sq)/n_size : res_sq/norm_ref;
+
+			// The progress callback sees the device copy of the solution.
+			if (Pfp != nullptr && Pfp(instance, dev_x, residual, &para, n_size, nz_size, iter))
+			{
+				ret = LCG_STOP;
+				break;
+			}
+
+			if (residual <= para.epsilon)
+			{
+				ret = LCG_CONVERGENCE;
+				break;
+			}
+
+			// max_iterations == 0 means "no limit".
+			if (para.max_iterations > 0 && iter+1 > para.max_iterations)
+			{
+				ret = LCG_REACHED_MAX_ITERATIONS;
+				break;
+			}
+
+			iter++;
+
+			// A_dir = A*dir
+			Afp(instance, cub_handle, cus_handle, dir_desc, Adir_desc, n_size, nz_size);
+
+			// step = (prec . res) / (dir . A*dir)
+			cublasDdot_v2(cub_handle, n_size, dir, 1, A_dir, 1, &dir_A_dir);
+			step = prec_res/dir_A_dir;
+			neg_step = -1.0*step;
+
+			cublasDaxpy_v2(cub_handle, n_size, &step, dir, 1, dev_x, 1); // x += step*dir
+			cublasDaxpy_v2(cub_handle, n_size, &neg_step, A_dir, 1, res, 1); // res -= step*A*dir
+
+			// prec = M(res)
+			Mfp(instance, cub_handle, cus_handle, res_desc, prec_desc, n_size, nz_size);
+
+			cublasDdot_v2(cub_handle, n_size, res, 1, res, 1, &res_sq);
+
+			cublasDdot_v2(cub_handle, n_size, prec, 1, res, 1, &next_prec_res);
+			beta = next_prec_res/prec_res;
+			prec_res = next_prec_res;
+
+			// dir = beta*dir + prec
+			cublasDscal_v2(cub_handle, n_size, &beta, dir, 1);
+			cublasDaxpy_v2(cub_handle, n_size, &pos_one, prec, 1, dir, 1);
+		}
+	}
+
+	// Hand the solution back to the caller, then release every device resource.
+	cudaMemcpy(m, dev_x, vec_bytes, cudaMemcpyDeviceToHost);
+
+	cudaFree(dev_x);
+	cudaFree(dev_b);
+	cudaFree(res);
+	cudaFree(prec);
+	cudaFree(dir);
+	cudaFree(A_dir);
+	cusparseDestroyDnVec(x_desc);
+	cusparseDestroyDnVec(res_desc);
+	cusparseDestroyDnVec(prec_desc);
+	cusparseDestroyDnVec(dir_desc);
+	cusparseDestroyDnVec(Adir_desc);
+
+	return ret;
+}
+
+
+/**
+ * @brief      Projected gradient solver for A*m = B with box constraints low <= m <= hig.
+ *
+ * m and B are copied into freshly allocated device buffers (cudaMemcpyHostToDevice), the
+ * iteration runs on those buffers and the solution is copied back into m before returning.
+ * Every iteration takes a step of length alpha_k along -gk (gk = A*m - B), projects the
+ * trial solution onto the box with lcg_set2box_cuda() and then updates the step length as
+ * alpha_k = (sk.sk)/(sk.yk), with sk = m_new - m and yk = gk_new - gk. The first step
+ * length is para.step. The convergence measure is sqrt(gk.gk)/n_size when para.abs_diff
+ * is set, and gk.gk/max(g0.g0, 1) otherwise.
+ *
+ * NOTE(review): low and hig are handed to lcg_set2box_cuda() unchanged together with device
+ * buffers, so they presumably have to be device pointers - confirm against algebra_cuda.h.
+ * NOTE(review): the cuBLAS D-routines and CUDA_R_64F descriptors suggest lcg_float is double.
+ *
+ * @param[in]  Afp         Callback computing the product A*x on device vectors.
+ * @param[in]  Pfp         Optional progress callback (may be nullptr). A non-zero return value
+ *                         inside the iteration stops the solver with LCG_STOP.
+ * @param      m           Initial solution on input, final solution on output.
+ * @param[in]  B           Right-hand side vector.
+ * @param[in]  low         Lower bound of the acceptable solution.
+ * @param[in]  hig         Higher bound of the acceptable solution.
+ * @param[in]  n_size      Length of m, B, low and hig.
+ * @param[in]  nz_size     Forwarded unchanged to Afp and Pfp.
+ * @param[in]  param       Solver parameters; defparam is used when nullptr.
+ * @param      instance    User data forwarded to the callbacks.
+ * @param      cub_handle  cuBLAS handle.
+ * @param      cus_handle  cuSPARSE handle (forwarded to Afp).
+ *
+ * @return     LCG_ALREADY_OPTIMIZIED, LCG_CONVERGENCE, LCG_STOP, LCG_REACHED_MAX_ITERATIONS,
+ *             or one of the LCG_INVILAD_* / LCG_INVALID_LAMBDA / LCG_INVALID_POINTER codes
+ *             for bad arguments.
+ */
+int lpg(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
+	const lcg_float* low, const lcg_float* hig, const int n_size, const int nz_size, const lcg_para* param, 
+	void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	// set CG parameters
+	lcg_para para = (param != nullptr) ? (*param) : defparam;
+
+	// check parameters
+	if (n_size <= 0) return LCG_INVILAD_VARIABLE_SIZE;
+	if (para.max_iterations < 0) return LCG_INVILAD_MAX_ITERATIONS;
+	if (para.epsilon <= 0.0 || para.epsilon >= 1.0) return LCG_INVILAD_EPSILON;
+	if (para.step <= 0.0) return LCG_INVALID_LAMBDA;
+
+	if (m == nullptr) return LCG_INVALID_POINTER;
+	if (B == nullptr) return LCG_INVALID_POINTER;
+	if (low == nullptr) return LCG_INVALID_POINTER;
+	if (hig == nullptr) return LCG_INVALID_POINTER;
+	if (cub_handle == nullptr) return LCG_INVALID_POINTER;
+	if (cus_handle == nullptr) return LCG_INVALID_POINTER;
+
+	// allocate device memory
+	lcg_float *d_m = nullptr, *d_B = nullptr;
+	lcg_float *gk = nullptr, *Adk = nullptr;
+	lcg_float *m_new = nullptr, *gk_new = nullptr;
+	lcg_float *sk = nullptr, *yk = nullptr;
+	cudaMalloc(&d_m, n_size * sizeof(lcg_float));
+	cudaMalloc(&d_B, n_size * sizeof(lcg_float));
+	cudaMalloc(&gk, n_size * sizeof(lcg_float));
+	cudaMalloc(&Adk, n_size * sizeof(lcg_float));
+	cudaMalloc(&m_new, n_size * sizeof(lcg_float));
+	cudaMalloc(&gk_new, n_size * sizeof(lcg_float));
+	cudaMalloc(&sk, n_size * sizeof(lcg_float));
+	cudaMalloc(&yk, n_size * sizeof(lcg_float));
+
+	// Copy initial solutions
+	cudaMemcpy(d_m, m, n_size * sizeof(lcg_float), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_B, B, n_size * sizeof(lcg_float), cudaMemcpyHostToDevice);
+
+	cusparseDnVecDescr_t dvec_m, dvec_mnew, dvec_Adk;
+	cusparseCreateDnVec(&dvec_m, n_size, d_m, CUDA_R_64F);
+	cusparseCreateDnVec(&dvec_mnew, n_size, m_new, CUDA_R_64F);
+	cusparseCreateDnVec(&dvec_Adk, n_size, Adk, CUDA_R_64F);
+
+	lcg_float none = -1.0;
+	lcg_float nalpha_k, alpha_k = para.step;
+
+	// Project the working copy of the initial solution onto the box. The device buffer d_m
+	// (not the caller's m) is what Afp reads below, so it is the one that must be clamped.
+	lcg_set2box_cuda(low, hig, d_m, n_size);
+	Afp(instance, cub_handle, cus_handle, dvec_m, dvec_Adk, n_size, nz_size);
+
+	// g0 = Ax - B
+	cudaMemcpy(gk, Adk, n_size * sizeof(lcg_float), cudaMemcpyDeviceToDevice); // g0 = A*x
+	cublasDaxpy_v2(cub_handle, n_size, &none, d_B, 1, gk, 1); // g0 -= B
+
+	// gk_mod is the squared 2-norm of the gradient (gk . gk)
+	lcg_float gk_mod;
+	cublasDdot_v2(cub_handle, n_size, gk, 1, gk, 1, &gk_mod);
+
+	// normalisation of the relative residual, never smaller than one
+	lcg_float g0_mod = gk_mod;
+	if (g0_mod < 1.0) g0_mod = 1.0;
+
+	int ret, t = 0;
+	if (para.abs_diff && sqrt(gk_mod)/n_size <= para.epsilon)
+	{
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, d_m, sqrt(gk_mod)/n_size, &para, n_size, nz_size, 0);
+		}
+		goto func_ends;
+	}
+	else if (gk_mod/g0_mod <= para.epsilon)
+	{
+		ret = LCG_ALREADY_OPTIMIZIED;
+		if (Pfp != nullptr)
+		{
+			Pfp(instance, d_m, gk_mod/g0_mod, &para, n_size, nz_size, 0);
+		}
+		goto func_ends;
+	}
+
+	// No initialisers here: the gotos above must not jump over an initialisation.
+	lcg_float sk_mod, syk_mod, residual;
+	while(1)
+	{
+		if (para.abs_diff) residual = sqrt(gk_mod)/n_size;
+		else residual = gk_mod/g0_mod;
+
+		if (Pfp != nullptr)
+		{
+			if (Pfp(instance, d_m, residual, &para, n_size, nz_size, t))
+			{
+				ret = LCG_STOP; goto func_ends;
+			}
+		}
+
+		if (residual <= para.epsilon)
+		{
+			ret = LCG_CONVERGENCE; goto func_ends;
+		}
+
+		// max_iterations == 0 means "no limit"
+		if (para.max_iterations > 0 && t+1 > para.max_iterations)
+		{
+			ret = LCG_REACHED_MAX_ITERATIONS;
+			break;
+		}
+		
+		t++;
+
+		// m_new = m - alpha_k*gk, projected onto the box
+		nalpha_k = -1.0*alpha_k;
+		cudaMemcpy(m_new, d_m, n_size * sizeof(lcg_float), cudaMemcpyDeviceToDevice);
+		cublasDaxpy_v2(cub_handle, n_size, &nalpha_k, gk, 1, m_new, 1);
+
+		lcg_set2box_cuda(low, hig, m_new, n_size);
+		Afp(instance, cub_handle, cus_handle, dvec_mnew, dvec_Adk, n_size, nz_size);
+		
+		// gk_new = A*m_new - B (B is subtracted from gk_new; gk must stay untouched for yk)
+		cudaMemcpy(gk_new, Adk, n_size * sizeof(lcg_float), cudaMemcpyDeviceToDevice);
+		cublasDaxpy_v2(cub_handle, n_size, &none, d_B, 1, gk_new, 1);
+
+		// sk = m_new - m
+		cudaMemcpy(sk, m_new, n_size * sizeof(lcg_float), cudaMemcpyDeviceToDevice);
+		cublasDaxpy_v2(cub_handle, n_size, &none, d_m, 1, sk, 1);
+
+		// yk = gk_new - gk
+		cudaMemcpy(yk, gk_new, n_size * sizeof(lcg_float), cudaMemcpyDeviceToDevice);
+		cublasDaxpy_v2(cub_handle, n_size, &none, gk, 1, yk, 1);
+
+		// alpha_k = (sk . sk)/(sk . yk)
+		cublasDdot_v2(cub_handle, n_size, sk, 1, sk, 1, &sk_mod);
+		cublasDdot_v2(cub_handle, n_size, sk, 1, yk, 1, &syk_mod);
+		alpha_k = sk_mod/syk_mod;
+
+		// accept the trial solution and its gradient
+		cudaMemcpy(d_m, m_new, n_size * sizeof(lcg_float), cudaMemcpyDeviceToDevice);
+		cudaMemcpy(gk, gk_new, n_size * sizeof(lcg_float), cudaMemcpyDeviceToDevice);
+
+		// Update the function-scope gk_mod (no re-declaration) so that the residual
+		// evaluated at the top of the loop reflects the new gradient.
+		cublasDdot_v2(cub_handle, n_size, gk, 1, gk, 1, &gk_mod);
+	}
+
+	func_ends:
+	{
+		// Copy to host memories
+		cudaMemcpy(m, d_m, n_size * sizeof(lcg_float), cudaMemcpyDeviceToHost);
+
+		cudaFree(d_m);
+		cudaFree(d_B);
+		cudaFree(gk);
+		cudaFree(gk_new);
+		cudaFree(m_new);
+		cudaFree(sk);
+		cudaFree(yk);
+		cudaFree(Adk);
+		cusparseDestroyDnVec(dvec_m);
+		cusparseDestroyDnVec(dvec_mnew);
+		cusparseDestroyDnVec(dvec_Adk);
+	}
+
+	return ret;
+}
--- a/src/lib/lcg_cuda.h
+++ b/src/lib/lcg_cuda.h
@@ -0,0 +1,135 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _LCG_CUDA_H
+#define _LCG_CUDA_H
+
+#include "util.h"
+#include "algebra_cuda.h"
+
+#ifdef LibLCG_CUDA
+
+#include <cublas_v2.h>
+#include <cusparse_v2.h>
+
+/**
+ * @brief  Callback interface for calculating the product of a N*N matrix 'A' multiplied 
+ * by a vertical vector 'x'. Note that both A and x are hosted on the GPU device.
+ * 
+ * @param  instance    The user data sent for the lcg_solver_cuda() functions by the client.
+ * @param  cub_handle  Handle of the cublas object.
+ * @param  cus_handle  Handle of the cusparse object.
+ * @param  x           Multiplier of the Ax product, given as a cusparse dense vector descriptor.
+ * @param  prod_Ax     Product of A multiplied by x, given as a cusparse dense vector descriptor.
+ * @param  n_size      Size of x and column/row numbers of A.
+ * @param  nz_size     The nz_size value passed to the solver function, forwarded unchanged.
+ */
+typedef void (*lcg_axfunc_cuda_ptr)(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size);
+
+/**
+ * @brief     Callback interface for monitoring the progress and terminate the iteration 
+ * if necessary. Note that m is hosted on the GPU device.
+ * 
+ * @param    instance    The user data sent for the lcg_solver_cuda() functions by the client.
+ * @param    m           The current solutions.
+ * @param    converge    The current value evaluating the iteration progress.
+ * @param    param       The parameter set used by the running solver.
+ * @param    n_size      The size of the variables
+ * @param    nz_size     The nz_size value passed to the solver function, forwarded unchanged.
+ * @param    k           The iteration count.
+ * 
+ * @retval   int         Zero to continue the optimization process. Returning a
+ *                       non-zero value will terminate the optimization process.
+ */
+typedef int (*lcg_progress_cuda_ptr)(void* instance, const lcg_float* m, const lcg_float converge, 
+	const lcg_para* param, const int n_size, const int nz_size, const int k);
+
+/**
+ * @brief      A combined conjugate gradient solver function.
+ *
+ * @note       NOTE(review): the solvers in lcg_cuda.cu copy m and B into device buffers with
+ * cudaMemcpyHostToDevice and copy the solution back into m with cudaMemcpyDeviceToHost, which
+ * does not match the earlier statement that m and B are hosted on the GPU device. Confirm
+ * which memory space callers are expected to use.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector. Holds the solution on return.
+ * @param      B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  nz_size     Size of the non-zero element of a cusparse object. Forwarded to the callbacks.
+ * @param      param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcg_solver_cuda() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      cub_handle  Handle of the cublas object.
+ * @param      cus_handle  Handle of the cusparse object.
+ * @param      solver_id   Solver type used to solve the linear system. LCG_CGS selects the CGS solver and 
+ * any other value selects the CG solver. The default value is LCG_CG.
+ *
+ * @return     Status of the function.
+ */
+int lcg_solver_cuda(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
+    const int n_size, const int nz_size, const lcg_para* param, void* instance, cublasHandle_t cub_handle, 
+    cusparseHandle_t cus_handle, lcg_solver_enum solver_id = LCG_CG);
+
+/**
+ * @brief      A combined preconditioned conjugate gradient solver function.
+ *
+ * @note       NOTE(review): the solver in lcg_cuda.cu copies m and B into device buffers with
+ * cudaMemcpyHostToDevice and copies the solution back into m with cudaMemcpyDeviceToHost, which
+ * does not match the earlier statement that m and B are hosted on the GPU device. Confirm
+ * which memory space callers are expected to use.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Mfp         Callback function for calculating the product of 'Mx' for preconditioning.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector. Holds the solution on return.
+ * @param      B           Objective vector of the linear system.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  nz_size     Size of the non-zero element of a cusparse object.
+ * @param      param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcg_solver_preconditioned_cuda() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      cub_handle  Handle of the cublas object.
+ * @param      cus_handle  Handle of the cusparse object.
+ * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_PCG. 
+ * The implementation in lcg_cuda.cu currently ignores this value and always runs the PCG solver.
+ *
+ * @return     Status of the function.
+ */
+int lcg_solver_preconditioned_cuda(lcg_axfunc_cuda_ptr Afp, lcg_axfunc_cuda_ptr Mfp, lcg_progress_cuda_ptr Pfp, 
+    lcg_float* m, const lcg_float* B, const int n_size, const int nz_size, const lcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id = LCG_PCG);
+
+/**
+ * @brief      A combined conjugate gradient solver function with inequality (box) constraints.
+ *
+ * @note       NOTE(review): the solver in lcg_cuda.cu copies m and B into device buffers with
+ * cudaMemcpyHostToDevice and copies the solution back into m with cudaMemcpyDeviceToHost, while
+ * low and hig are passed to lcg_set2box_cuda() unchanged. Confirm which memory space callers
+ * are expected to use for each of these arrays.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector. Holds the solution on return.
+ * @param      B           Objective vector of the linear system.
+ * @param      low         Lower bound of the acceptable solution.
+ * @param      hig         Higher bound of the acceptable solution.
+ * @param[in]  n_size      Size of the solution vector and objective vector.
+ * @param[in]  nz_size     Size of the non-zero element of a cusparse object.
+ * @param      param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcg_solver_constrained_cuda() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      cub_handle  Handle of the cublas object.
+ * @param      cus_handle  Handle of the cusparse object.
+ * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_PG. 
+ * The implementation in lcg_cuda.cu currently ignores this value and always runs the PG solver.
+ *
+ * @return     Status of the function.
+ */
+int lcg_solver_constrained_cuda(lcg_axfunc_cuda_ptr Afp, lcg_progress_cuda_ptr Pfp, lcg_float* m, const lcg_float* B, 
+    const lcg_float* low, const lcg_float* hig, const int n_size, const int nz_size, const lcg_para* param, void* instance, 
+    cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_solver_enum solver_id = LCG_PG);
+
+#endif // LibLCG_CUDA
+
+#endif // _LCG_CUDA_H
--- a/src/lib/lcg_eigen.cpp
+++ b/src/lib/lcg_eigen.cpp
--- a/src/lib/lcg_eigen.h
+++ b/src/lib/lcg_eigen.h
@@ -0,0 +1,110 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _LCG_EIGEN_H
+#define _LCG_EIGEN_H
+
+#include "util.h"
+#include "algebra_eigen.h"
+
+/**
+ * @brief  Callback interface for calculating the product of a N*N matrix 'A' multiplied 
+ * by a vertical vector 'x'.
+ * 
+ * @note   The same callback type is also used for the preconditioning callback 'Mfp' of 
+ * lcg_solver_preconditioned_eigen().
+ * 
+ * @param[in]  instance    The user data sent for the lcg_solver() functions by the client.
+ * @param[in]  x           Multiplier of the Ax product.
+ * @param[out] prod_Ax     Product of A multiplied by x.
+ */
+typedef void (*lcg_axfunc_eigen_ptr)(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Ax);
+
+/**
+ * @brief     Callback interface for monitoring the progress and terminating the iteration 
+ * if necessary.
+ * 
+ * @param    instance    The user data sent for the lcg_solver() functions by the client.
+ * @param    m           The current solutions.
+ * @param    converge    The current value evaluating the iteration progress.
+ * @param    param       Parameter setup of the conjugate gradient methods (presumably the 
+ *                       same object that was handed to the solver -- confirm against the implementation).
+ * @param    k           The iteration count.
+ * 
+ * @retval   int         Zero to continue the optimization process. Returning a
+ *                       non-zero value will terminate the optimization process.
+ */
+typedef int (*lcg_progress_eigen_ptr)(void* instance, const Eigen::VectorXd *m, const lcg_float converge, 
+	const lcg_para *param, const int k);
+
+/**
+ * @brief      A combined conjugate gradient solver function.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector.
+ * @param      B           Objective vector of the linear system.
+ * @param      param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcg_solver() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_CG.
+ *
+ * @return     Status of the function.
+ */
+int lcg_solver_eigen(lcg_axfunc_eigen_ptr Afp, lcg_progress_eigen_ptr Pfp, Eigen::VectorXd &m, 
+	const Eigen::VectorXd &B, const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_CG);
+
+/**
+ * @brief      A combined conjugate gradient solver function for the preconditioned methods.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Mfp         Callback function for calculating the product of 'M^{-1}x', in which M is the preconditioning matrix.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector.
+ * @param[in]  B           Objective vector of the linear system.
+ * @param[in]  param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcg_solver() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param[in]  solver_id   Solver type used to solve the linear system. The default value is LCG_PCG.
+ *
+ * @return     Status of the function.
+ */
+int lcg_solver_preconditioned_eigen(lcg_axfunc_eigen_ptr Afp, lcg_axfunc_eigen_ptr Mfp, lcg_progress_eigen_ptr Pfp, 
+	Eigen::VectorXd &m, const Eigen::VectorXd &B, const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_PCG);
+
+/**
+ * @brief      A combined conjugate gradient solver function with inequality constraints.
+ *
+ * @param[in]  Afp         Callback function for calculating the product of 'Ax'.
+ * @param[in]  Pfp         Callback function for monitoring the iteration progress.
+ * @param      m           Initial solution vector.
+ * @param      B           Objective vector of the linear system.
+ * @param[in]  low         The lower boundary of the acceptable solution.
+ * @param[in]  hig         The higher boundary of the acceptable solution.
+ * @param      param       Parameter setup for the conjugate gradient methods.
+ * @param      instance    The user data sent for the lcg_solver() function by the client. 
+ * This variable is either 'this' for class member functions or 'NULL' for global functions.
+ * @param      solver_id   Solver type used to solve the linear system. The default value is LCG_PG.
+ *
+ * @return     Status of the function.
+ */
+int lcg_solver_constrained_eigen(lcg_axfunc_eigen_ptr Afp, lcg_progress_eigen_ptr Pfp, Eigen::VectorXd &m, 
+	const Eigen::VectorXd &B, const Eigen::VectorXd &low, const Eigen::VectorXd &hig, 
+	const lcg_para* param, void* instance, lcg_solver_enum solver_id = LCG_PG);
+
+#endif //_LCG_EIGEN_H
--- a/src/lib/preconditioner.cpp
+++ b/src/lib/preconditioner.cpp
@@ -0,0 +1,381 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "preconditioner.h"
+
+#include "cmath"
+#include "map"
+
+/**
+ * @brief Count the entries of a COO sparse matrix that are located in the lower 
+ * triangle (diagonal included), i.e. the entries with row >= col.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements. A value <= 0 yields a count of zero.
+ * @param[out] lnz_size   Number of the non-zero elements in the lower triangle.
+ */
+void lcg_incomplete_Cholesky_half_buffsize_coo(const int *row, const int *col, int nz_size, int *lnz_size)
+{
+    // A signed counter is used on purpose: comparing an unsigned index against a 
+    // negative nz_size would convert it into a huge bound and read out of range.
+    int count = 0;
+    for (int i = 0; i < nz_size; i++)
+    {
+        if (row[i] >= col[i])
+        {
+            count++;
+        }
+    }
+    *lnz_size = count;
+    return;
+}
+
+/**
+ * @brief Incomplete Cholesky factorization of a sparse matrix stored in the COO format. 
+ * Only the lower triangular factor is computed.
+ * 
+ * The entries with row >= col are copied to IC_row/IC_col/IC_val in their input order, 
+ * then IC_val is overwritten in place with the values of the factor.
+ * 
+ * @note NOTE(review): the loops below look like they expect 0-based indices, entries sorted 
+ * by row and then by column, a stored diagonal entry in every row and (0,0) as the first 
+ * entry. None of this is verified here -- confirm against the callers.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  val        Non-zero values of the input sparse matrix.
+ * @param[in]  N          Row/Column size of the sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements of the input matrix.
+ * @param[in]  lnz_size   Number of the non-zero elements in the lower triangle (diagonal included).
+ * @param[out] IC_row     Row index of the factorized lower triangular matrix.
+ * @param[out] IC_col     Column index of the factorized lower triangular matrix.
+ * @param[out] IC_val     Non-zero values of the factorized lower triangular matrix.
+ */
+void lcg_incomplete_Cholesky_half_coo(const int *row, const int *col, const lcg_float *val, int N, int nz_size, 
+    int lnz_size, int *IC_row, int *IC_col, lcg_float *IC_val)
+{
+    // Nothing to factorize. Without this guard the code below would access IC_row[0], 
+    // IC_val[0] and diagonal[0] although the arrays hold no element.
+    if (N <= 0 || lnz_size <= 0) return;
+
+    // We use this to store diagonal elements of the factorized lower triangular matrix
+    lcg_float *diagonal = new lcg_float [N];
+    // A temporary row
+    lcg_float *tmp_row = new lcg_float [N];
+    // index of non-zero elements in tmp_row
+    int *filled_idx = new int [N];
+    // Beginning index of each row in the output matrix
+    int *row_st_idx = new int [N];
+
+    size_t i, j, f;
+
+    // Set initial values
+    for (i = 0; i < N; i++)
+    {
+        diagonal[i] = 0.0;
+        tmp_row[i] = 0.0;
+        filled_idx[i] = -1;
+        row_st_idx[i] = -1;
+    }
+
+    // copy elements in the lower triangle to the output matrix
+    j = 0;
+    for (i = 0; i < nz_size; i++)
+    {
+        if (row[i] >= col[i])
+        {
+            IC_row[j] = row[i];
+            IC_col[j] = col[i];
+            IC_val[j] = val[i];
+            j++;
+        }
+    }
+
+    // Get the beginning index of each row in the matrix. A new slot is opened every time 
+    // the row number increases, so slot k only matches row k if no row is empty.
+    j = 1;
+    row_st_idx[0] = 0; // the first stored row begins at storage index 0
+    size_t old_row = IC_row[0];
+    for (i = 1; i < lnz_size; i++)
+    {
+        if (IC_row[i] > old_row)
+        {
+            row_st_idx[j] = i;
+            old_row = IC_row[i];
+            j++;
+        }
+    }
+
+    // Calculate the first element
+    IC_val[0] = sqrt(IC_val[0]);
+    diagonal[0] = IC_val[0];
+
+    // Sum of the squared off-diagonal factor values of the row being processed. 
+    // It is consumed and cleared when the diagonal entry of that row is reached.
+    lcg_float dia_sum;
+    dia_sum = 0.0;
+    // The first one is already calculated
+    for (i = 1; i < lnz_size; i++)
+    {
+        // Calculate the first column if there is one
+        if (IC_col[i] == 0)
+        {
+            IC_val[i] = IC_val[i]/IC_val[0];
+            dia_sum = dia_sum + IC_val[i]*IC_val[i];
+            continue; // Case 1 break
+        }
+        
+        // Calculate elements in the middle of a row
+        if (IC_row[i] > IC_col[i])
+        {
+            // Scatter the already factorized entries of row IC_col[i] (left of its 
+            // diagonal) into tmp_row and remember which slots were touched
+            f = 0;
+            j = row_st_idx[IC_col[i]];
+            while (IC_col[j] < IC_col[i])
+            {
+                tmp_row[IC_col[j]] = IC_val[j];
+                filled_idx[f]  = IC_col[j];
+                f++;
+                j++;
+            }
+
+            // Subtract the products of the matching columns of the current row
+            j = row_st_idx[IC_row[i]];
+            while (IC_col[j] < IC_col[i])
+            {
+                IC_val[i] = IC_val[i] - IC_val[j]*tmp_row[IC_col[j]];
+                j++;
+            }
+            
+            IC_val[i] = IC_val[i]/diagonal[IC_col[i]];
+            dia_sum = dia_sum + IC_val[i]*IC_val[i];
+
+            // reset tmp variables (only the touched slots)
+            for (j = 0; j < f; j++)
+            {
+                tmp_row[filled_idx[j]] = 0.0;
+            }
+
+            continue; // Case 2 break
+        }
+        
+        // We have reached the diagonal position
+        if (IC_row[i] == IC_col[i])
+        {
+            IC_val[i] = sqrt(IC_val[i] - dia_sum);
+            diagonal[IC_col[i]] = IC_val[i];
+            dia_sum = 0.0;
+        }
+    }
+
+    delete[] diagonal;
+    delete[] tmp_row;
+    delete[] row_st_idx;
+    delete[] filled_idx;
+    return;
+}
+
+/**
+ * @brief Incomplete Cholesky factorization of a sparse matrix stored in the COO format. 
+ * The factor is stored in the lower triangle and mirrored into the upper triangle.
+ * 
+ * All entries are copied to IC_row/IC_col/IC_val first. The entries with row >= col are 
+ * then overwritten in place with the factor values, and every off-diagonal factor value 
+ * (r,c) is also written to the stored entry (c,r) if the input contains one.
+ * 
+ * @note NOTE(review): the loops below look like they expect 0-based indices, entries sorted 
+ * by row and then by column, a stored diagonal entry in every row and (0,0) as the first 
+ * entry. None of this is verified here -- confirm against the callers.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  val        Non-zero values of the input sparse matrix.
+ * @param[in]  N          Row/Column size of the sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements.
+ * @param[out] IC_row     Row index of the factorized sparse matrix.
+ * @param[out] IC_col     Column index of the factorized sparse matrix.
+ * @param[out] IC_val     Non-zero values of the factorized sparse matrix.
+ */
+void lcg_incomplete_Cholesky_full_coo(const int *row, const int *col, const lcg_float *val, int N, int nz_size, int *IC_row, int *IC_col, lcg_float *IC_val)
+{
+    // Nothing to factorize. Without this guard the code below would access IC_row[0], 
+    // IC_val[0] and diagonal[0] although the arrays hold no element.
+    if (N <= 0 || nz_size <= 0) return;
+
+    // We use this to store diagonal elements of the factorized lower triangular matrix
+    lcg_float *diagonal = new lcg_float [N];
+    // A temporary row
+    lcg_float *tmp_row = new lcg_float [N];
+    // index of non-zero elements in tmp_row
+    int *filled_idx = new int [N];
+    // Beginning index of each row in the input matrix
+    int *row_st_idx = new int [N];
+
+    size_t i, j, f, l;
+
+    // Set initial values
+    for (i = 0; i < N; i++)
+    {
+        diagonal[i] = 0.0;
+        tmp_row[i] = 0.0;
+        filled_idx[i] = -1;
+        row_st_idx[i] = -1;
+    }
+
+    // copy elements to the output matrix
+    for (i = 0; i < nz_size; i++)
+    {
+        IC_row[i] = row[i];
+        IC_col[i] = col[i];
+        IC_val[i] = val[i];
+    }
+
+    // count element number in the lower triangular part (including the diagonal)
+    // build map from the coordinates of the upper triangular elements to their index in the array
+    size_t order, L_nz = 0;
+    std::map<size_t, size_t> index_map;
+    std::map<size_t, size_t>::const_iterator mirror;
+
+    for (i = 0; i < nz_size; i++)
+    {
+        if (row[i] >= col[i]) // Count number for the lower triangular part
+        {
+            L_nz++;
+        }
+        else // Only need to build the map for the upper triangular part
+        {
+            // Widen before multiplying: N*row[i] does not fit into an int 
+            // once N grows beyond roughly 46340.
+            order = static_cast<size_t>(N)*row[i] + col[i];
+            index_map[order] = i;
+        }
+    }
+
+    // We use this to store element index in the lower triangle
+    j = 0;
+    size_t *low_idx = new size_t [L_nz];
+    for (i = 0; i < nz_size; i++)
+    {
+        if (row[i] >= col[i])
+        {
+            low_idx[j] = i;
+            j++;
+        }
+    }
+
+    // Get the beginning index of each row in the matrix. A new slot is opened every time 
+    // the row number increases, so slot k only matches row k if no row is empty.
+    j = 1;
+    row_st_idx[0] = 0; // the first stored row begins at storage index 0
+    size_t old_row = IC_row[0];
+    for (i = 1; i < nz_size; i++)
+    {
+        if (IC_row[i] > old_row)
+        {
+            row_st_idx[j] = i;
+            old_row = IC_row[i];
+            j++;
+        }
+    }
+
+    // Calculate the first element
+    IC_val[0] = sqrt(IC_val[0]);
+    diagonal[0] = IC_val[0];
+
+    // Sum of the squared off-diagonal factor values of the row being processed. 
+    // It is consumed and cleared when the diagonal entry of that row is reached.
+    lcg_float dia_sum;
+    dia_sum = 0.0;
+    // The first one is already calculated
+    for (i = 1; i < L_nz; i++)
+    {
+        l = low_idx[i];
+
+        // Calculate the first column if there is one
+        if (IC_col[l] == 0)
+        {
+            IC_val[l] = IC_val[l]/IC_val[0];
+            dia_sum = dia_sum + IC_val[l]*IC_val[l];
+            // Set value at the upper triangle. The mirror of (r,0) is (0,r) whose key is N*0 + r. 
+            // A lookup is used instead of operator[] so that a missing mirror entry 
+            // is skipped rather than silently redirected to IC_val[0].
+            order = IC_row[l];
+            mirror = index_map.find(order);
+            if (mirror != index_map.end()) IC_val[mirror->second] = IC_val[l];
+            continue; // Case 1 break
+        }
+        
+        // Calculate elements in the middle of a row
+        if (IC_row[l] > IC_col[l])
+        {
+            // Scatter the already factorized entries of row IC_col[l] (left of its 
+            // diagonal) into tmp_row and remember which slots were touched
+            f = 0;
+            j = row_st_idx[IC_col[l]];
+            while (IC_col[j] < IC_col[l])
+            {
+                tmp_row[IC_col[j]] = IC_val[j];
+                filled_idx[f]  = IC_col[j];
+                f++;
+                j++;
+            }
+
+            // Subtract the products of the matching columns of the current row
+            j = row_st_idx[IC_row[l]];
+            while (IC_col[j] < IC_col[l])
+            {
+                IC_val[l] = IC_val[l] - IC_val[j]*tmp_row[IC_col[j]];
+                j++;
+            }
+            
+            IC_val[l] = IC_val[l]/diagonal[IC_col[l]];
+            dia_sum = dia_sum + IC_val[l]*IC_val[l];
+
+            // Set value at the upper triangle (skipped if the mirror entry is not stored)
+            order = static_cast<size_t>(N)*IC_col[l] + IC_row[l];
+            mirror = index_map.find(order);
+            if (mirror != index_map.end()) IC_val[mirror->second] = IC_val[l];
+
+            // reset tmp variables (only the touched slots)
+            for (j = 0; j < f; j++)
+            {
+                tmp_row[filled_idx[j]] = 0.0;
+            }
+
+            continue; // Case 2 break
+        }
+        
+        // We have reached the diagonal position
+        if (IC_row[l] == IC_col[l])
+        {
+            IC_val[l] = sqrt(IC_val[l] - dia_sum);
+            diagonal[IC_col[l]] = IC_val[l];
+            dia_sum = 0.0;
+        }
+    }
+
+    delete[] diagonal;
+    delete[] tmp_row;
+    delete[] row_st_idx;
+    delete[] filled_idx;
+    delete[] low_idx;
+    index_map.clear();
+    return;
+}
+
+/**
+ * @brief Solve the linear system Ux = B by backward substitution, in which U is an 
+ * upper triangular matrix stored in the COO format.
+ * 
+ * Rows are processed from N-1 down to 0 while the entries are scanned backwards from the 
+ * end of the arrays. The scan of a row stops at its diagonal entry and the next row resumes 
+ * right before it, so the entries must be ordered by row and the diagonal must come before 
+ * the other used entries of its row. Entries with col < row are ignored. A row without a 
+ * stored diagonal entry leaves x[row] at zero.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  U          Non-zero values of the input sparse matrix.
+ * @param[in]  B          Right-hand side array.
+ * @param[out] x          The returned solution.
+ * @param[in]  N          Row/Column size of the sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements.
+ */
+void lcg_solve_upper_triangle_coo(const int *row, const int *col, const lcg_float *U, const lcg_float *B, lcg_float *x, int N, int nz_size)
+{
+    for (int i = 0; i < N; i++)
+    {
+        x[i] = 0.0;
+    }
+    
+    // Signed counters are required here: both loops count down to zero, and an unsigned 
+    // counter can never fail a '>= 0' test -- it wraps around and indexes out of range.
+    int iter = nz_size - 1;
+    double sum;
+    for (int i = N-1; i >= 0; i--)
+    {
+        sum = 0.0;
+        for (int j = iter; j >= 0; j--)
+        {
+            if (row[j] != i) continue;
+
+            if (col[j] > i)
+            {
+                sum += U[j] * x[col[j]];
+            }
+            else if (col[j] == i)
+            {
+                x[i] = (B[i] - sum)/U[j];
+                iter = j-1; // the next row resumes right before this diagonal entry
+                break;
+            }
+        }
+
+        // Every entry has been consumed, the remaining rows keep their zero value
+        if (iter < 0) break;
+    }
+    return;
+}
+
+/**
+ * @brief Solve the linear system Lx = B by forward substitution, in which L is a 
+ * lower triangular matrix stored in the COO format.
+ * 
+ * Rows are processed from 0 to N-1 while the entries are scanned forwards. The scan of 
+ * a row stops at its diagonal entry and the next row resumes right after it. Entries with 
+ * col > row are ignored.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  L          Non-zero values of the input sparse matrix.
+ * @param[in]  B          Right-hand side array.
+ * @param[out] x          The returned solution.
+ * @param[in]  N          Row/Column size of the sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements.
+ */
+void lcg_solve_lower_triangle_coo(const int *row, const int *col, const lcg_float *L, const lcg_float *B, lcg_float *x, int N, int nz_size)
+{
+    size_t r, k;
+
+    // Start from a zero solution
+    for (r = 0; r < N; r++) x[r] = 0.0;
+
+    // Position at which the scan of the next row starts
+    size_t start = 0;
+    for (r = 0; r < N; r++)
+    {
+        double acc = 0.0;
+        k = start;
+        while (k < nz_size)
+        {
+            if (row[k] == r)
+            {
+                if (col[k] == r)
+                {
+                    // Diagonal entry reached, the row is solved
+                    x[r] = (B[r] - acc)/L[k];
+                    start = k + 1;
+                    break;
+                }
+
+                if (col[k] < r) acc += L[k] * x[col[k]];
+            }
+            k++;
+        }
+    }
+    return;
+}
+
+/**
+ * @brief Check a square COO sparse matrix by counting its stored non-zero diagonal entries.
+ * 
+ * @note Only the diagonal is inspected: the function returns true exactly when the number 
+ * of stored entries with row == col and a non-zero value equals N.
+ * 
+ * @param[in] row        Row index of the input sparse matrix.
+ * @param[in] col        Column index of the input sparse matrix.
+ * @param[in] M          Non-zero values of the input sparse matrix.
+ * @param[in] N          Row/Column size of the sparse matrix.
+ * @param[in] nz_size    Number of the non-zero elements.
+ * @return true if the count equals N, false otherwise.
+ */
+bool lcg_full_rank_coo(const int *row, const int *col, const lcg_float *M, int N, int nz_size)
+{
+    size_t diag_count = 0;
+    size_t k = 0;
+    while (k < nz_size)
+    {
+        if (row[k] == col[k] && M[k] != 0.0) diag_count++;
+        k++;
+    }
+
+    return diag_count == N;
+}
--- a/src/lib/preconditioner.h
+++ b/src/lib/preconditioner.h
@@ -0,0 +1,110 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _PRECONDITIONER_H
+#define _PRECONDITIONER_H
+
+#include "algebra.h"
+
+/**
+ * @brief Return the number of non-zero elements in the lower triangular part 
+ * (diagonal included) of the input matrix.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  nz_size    Length of the non-zero elements.
+ * @param[out] lnz_size   Length of the non-zero elements in the lower triangle.
+ */
+void lcg_incomplete_Cholesky_half_buffsize_coo(const int *row, const int *col, int nz_size, int *lnz_size);
+
+/**
+ * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format.
+ * 
+ * @note  Only the factorized lower triangular matrix is stored in the lower part of the output matrix accordingly.
+ * 
+ * @param row        Row index of the input sparse matrix.
+ * @param col        Column index of the input sparse matrix.
+ * @param val        Non-zero values of the input sparse matrix.
+ * @param N          Row/Column size of the sparse matrix.
+ * @param nz_size    Length of the non-zero elements.
+ * @param lnz_size   Length of the non-zero elements in the lower triangle. See lcg_incomplete_Cholesky_half_buffsize_coo().
+ * @param IC_row     Row index of the factorized triangular sparse matrix.
+ * @param IC_col     Column index of the factorized triangular sparse matrix.
+ * @param IC_val     Non-zero values of the factorized triangular sparse matrix.
+ */
+void lcg_incomplete_Cholesky_half_coo(const int *row, const int *col, const lcg_float *val, int N, int nz_size, int lnz_size, int *IC_row, int *IC_col, lcg_float *IC_val);
+
+/**
+ * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format.
+ * 
+ * @note  The factorized lower and upper triangular matrices are stored in the lower and upper triangular parts of the output matrix accordingly.
+ * 
+ * @param row        Row index of the input sparse matrix.
+ * @param col        Column index of the input sparse matrix.
+ * @param val        Non-zero values of the input sparse matrix.
+ * @param N          Row/Column size of the sparse matrix.
+ * @param nz_size    Length of the non-zero elements.
+ * @param IC_row     Row index of the factorized triangular sparse matrix.
+ * @param IC_col     Column index of the factorized triangular sparse matrix.
+ * @param IC_val     Non-zero values of the factorized triangular sparse matrix.
+ */
+void lcg_incomplete_Cholesky_full_coo(const int *row, const int *col, const lcg_float *val, int N, int nz_size, int *IC_row, int *IC_col, lcg_float *IC_val);
+
+/**
+ * @brief Solve the linear system Ux = B, in which U is an upper triangular matrix.
+ * 
+ * @param row        Row index of the input sparse matrix.
+ * @param col        Column index of the input sparse matrix.
+ * @param U          Non-zero values of the input sparse matrix.
+ * @param B          Right-hand side array.
+ * @param x          The returned solution.
+ * @param N          Row/Column size of the sparse matrix.
+ * @param nz_size    Length of the non-zero elements.
+ */
+void lcg_solve_upper_triangle_coo(const int *row, const int *col, const lcg_float *U, const lcg_float *B, lcg_float *x, int N, int nz_size);
+
+/**
+ * @brief Solve the linear system Lx = B, in which L is a lower triangular matrix.
+ * 
+ * @param row        Row index of the input sparse matrix.
+ * @param col        Column index of the input sparse matrix.
+ * @param L          Non-zero values of the input sparse matrix.
+ * @param B          Right-hand side array.
+ * @param x          The returned solution.
+ * @param N          Row/Column size of the sparse matrix.
+ * @param nz_size    Length of the non-zero elements.
+ */
+void lcg_solve_lower_triangle_coo(const int *row, const int *col, const lcg_float *L, const lcg_float *B, lcg_float *x, int N, int nz_size);
+
+/**
+ * @brief Check to see if a square matrix is full ranked or not. The sparse matrix is stored in the COO format.
+ * 
+ * @note  NOTE(review): the implementation in preconditioner.cpp only counts the stored 
+ * non-zero diagonal entries and compares the count with N.
+ * 
+ * @param row        Row index of the input sparse matrix.
+ * @param col        Column index of the input sparse matrix.
+ * @param M          Non-zero values of the input sparse matrix.
+ * @param N          Row/Column size of the sparse matrix.
+ * @param nz_size    Length of the non-zero elements.
+ * @return true      The matrix is full ranked.
+ * @return false     The matrix is not full ranked.
+ */
+bool lcg_full_rank_coo(const int *row, const int *col, const lcg_float *M, int N, int nz_size);
+
+#endif // _PRECONDITIONER_H
--- a/src/lib/preconditioner_cuda.cu
+++ b/src/lib/preconditioner_cuda.cu
@@ -0,0 +1,421 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "preconditioner_cuda.h"
+#include "map"
+
+/**
+ * @brief Count the entries of a COO sparse matrix that are located in the lower 
+ * triangle (diagonal included), i.e. the entries with row >= col. The arrays are 
+ * dereferenced directly by this host function.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements. A value <= 0 yields a count of zero.
+ * @param[out] lnz_size   Number of the non-zero elements in the lower triangle.
+ */
+void clcg_incomplete_Cholesky_cuda_half_buffsize(const int *row, const int *col, int nz_size, int *lnz_size)
+{
+    // A signed counter is used on purpose: comparing an unsigned index against a 
+    // negative nz_size would convert it into a huge bound and read out of range.
+    int count = 0;
+    for (int i = 0; i < nz_size; i++)
+    {
+        if (row[i] >= col[i])
+        {
+            count++;
+        }
+    }
+    *lnz_size = count;
+    return;
+}
+
+/**
+ * @brief Incomplete Cholesky factorization of a single-precision complex sparse matrix 
+ * stored in the COO format. Only the lower triangular factor is computed.
+ * 
+ * The entries with row >= col are copied to IC_row/IC_col/IC_val in their input order, 
+ * then IC_val is overwritten in place with the values of the factor. All arrays are 
+ * dereferenced directly by this host function. The products are formed without complex 
+ * conjugation.
+ * 
+ * @note NOTE(review): the loops below look like they expect 0-based indices, entries sorted 
+ * by row and then by column, a stored diagonal entry in every row and (0,0) as the first 
+ * entry. None of this is verified here -- confirm against the callers.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  val        Non-zero values of the input sparse matrix.
+ * @param[in]  N          Row/Column size of the sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements of the input matrix.
+ * @param[in]  lnz_size   Number of the non-zero elements in the lower triangle (diagonal included).
+ * @param[out] IC_row     Row index of the factorized lower triangular matrix.
+ * @param[out] IC_col     Column index of the factorized lower triangular matrix.
+ * @param[out] IC_val     Non-zero values of the factorized lower triangular matrix.
+ */
+void clcg_incomplete_Cholesky_cuda_half(const int *row, const int *col, const cuComplex *val, int N, int nz_size, 
+    int lnz_size, int *IC_row, int *IC_col, cuComplex *IC_val)
+{
+    // Nothing to factorize. Without this guard the code below would access IC_row[0], 
+    // IC_val[0] and diagonal[0] although the arrays hold no element.
+    if (N <= 0 || lnz_size <= 0) return;
+
+    // We use this to store diagonal elements of the factorized lower triangular matrix
+    cuComplex *diagonal = new cuComplex [N];
+    // A temporary row
+    cuComplex *tmp_row = new cuComplex [N];
+    // index of non-zero elements in tmp_row
+    int *filled_idx = new int [N];
+    // Beginning index of each row in the output matrix
+    int *row_st_idx = new int [N];
+
+    size_t i, j, f;
+
+    // Set initial values
+    for (i = 0; i < N; i++)
+    {
+        diagonal[i].x = 0.0f; diagonal[i].y = 0.0f;
+        tmp_row[i].x = 0.0f; tmp_row[i].y = 0.0f;
+        filled_idx[i] = -1;
+        row_st_idx[i] = -1;
+    }
+
+    // copy elements in the lower triangle to the output matrix
+    j = 0;
+    for (i = 0; i < nz_size; i++)
+    {
+        if (row[i] >= col[i])
+        {
+            IC_row[j] = row[i];
+            IC_col[j] = col[i];
+            IC_val[j] = val[i];
+            j++;
+        }
+    }
+
+    // Get the beginning index of each row in the matrix. A new slot is opened every time 
+    // the row number increases, so slot k only matches row k if no row is empty.
+    j = 1;
+    row_st_idx[0] = 0; // the first stored row begins at storage index 0
+    size_t old_row = IC_row[0];
+    for (i = 1; i < lnz_size; i++)
+    {
+        if (IC_row[i] > old_row)
+        {
+            row_st_idx[j] = i;
+            old_row = IC_row[i];
+            j++;
+        }
+    }
+
+    // Calculate the first element
+    IC_val[0] = clcg_Csqrt(IC_val[0]);
+    diagonal[0] = IC_val[0];
+
+    // Sum of the squared off-diagonal factor values of the row being processed. 
+    // It is consumed and cleared when the diagonal entry of that row is reached.
+    cuComplex dia_sum;
+    dia_sum.x = 0.0f; dia_sum.y = 0.0f;
+    // The first one is already calculated
+    for (i = 1; i < lnz_size; i++)
+    {
+        // Calculate the first column if there is one
+        if (IC_col[i] == 0)
+        {
+            IC_val[i] = cuCdivf(IC_val[i], IC_val[0]);
+            dia_sum = clcg_Csum(dia_sum, cuCmulf(IC_val[i], IC_val[i]));
+            continue; // Case 1 break
+        }
+        
+        // Calculate elements in the middle of a row
+        if (IC_row[i] > IC_col[i])
+        {
+            // Scatter the already factorized entries of row IC_col[i] (left of its 
+            // diagonal) into tmp_row and remember which slots were touched
+            f = 0;
+            j = row_st_idx[IC_col[i]];
+            while (IC_col[j] < IC_col[i])
+            {
+                tmp_row[IC_col[j]] = IC_val[j];
+                filled_idx[f]  = IC_col[j];
+                f++;
+                j++;
+            }
+
+            // Subtract the products of the matching columns of the current row
+            j = row_st_idx[IC_row[i]];
+            while (IC_col[j] < IC_col[i])
+            {
+                IC_val[i] = clcg_Cdiff(IC_val[i], cuCmulf(IC_val[j], tmp_row[IC_col[j]]));
+                j++;
+            }
+            
+            IC_val[i] = cuCdivf(IC_val[i], diagonal[IC_col[i]]);
+            dia_sum = clcg_Csum(dia_sum, cuCmulf(IC_val[i], IC_val[i]));
+
+            // reset tmp variables (only the touched slots)
+            for (j = 0; j < f; j++)
+            {
+                tmp_row[filled_idx[j]].x = 0.0f; tmp_row[filled_idx[j]].y = 0.0f;
+            }
+
+            continue; // Case 2 break
+        }
+        
+        // We have reached the diagonal position
+        if (IC_row[i] == IC_col[i])
+        {
+            IC_val[i] = clcg_Csqrt(clcg_Cdiff(IC_val[i], dia_sum));
+            diagonal[IC_col[i]] = IC_val[i];
+            dia_sum.x = 0.0f; dia_sum.y = 0.0f;
+        }
+    }
+
+    delete[] diagonal;
+    delete[] tmp_row;
+    delete[] row_st_idx;
+    delete[] filled_idx;
+    return;
+}
+
+/**
+ * @brief Incomplete Cholesky factorization of a double-precision complex sparse matrix 
+ * stored in the COO format. Only the lower triangular factor is computed.
+ * 
+ * The entries with row >= col are copied to IC_row/IC_col/IC_val in their input order, 
+ * then IC_val is overwritten in place with the values of the factor. All arrays are 
+ * dereferenced directly by this host function. The products are formed without complex 
+ * conjugation.
+ * 
+ * @note NOTE(review): the loops below look like they expect 0-based indices, entries sorted 
+ * by row and then by column, a stored diagonal entry in every row and (0,0) as the first 
+ * entry. None of this is verified here -- confirm against the callers.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  val        Non-zero values of the input sparse matrix.
+ * @param[in]  N          Row/Column size of the sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements of the input matrix.
+ * @param[in]  lnz_size   Number of the non-zero elements in the lower triangle (diagonal included).
+ * @param[out] IC_row     Row index of the factorized lower triangular matrix.
+ * @param[out] IC_col     Column index of the factorized lower triangular matrix.
+ * @param[out] IC_val     Non-zero values of the factorized lower triangular matrix.
+ */
+void clcg_incomplete_Cholesky_cuda_half(const int *row, const int *col, const cuDoubleComplex *val, int N, int nz_size, 
+    int lnz_size, int *IC_row, int *IC_col, cuDoubleComplex *IC_val)
+{
+    // Nothing to factorize. Without this guard the code below would access IC_row[0], 
+    // IC_val[0] and diagonal[0] although the arrays hold no element.
+    if (N <= 0 || lnz_size <= 0) return;
+
+    // We use this to store diagonal elements of the factorized lower triangular matrix
+    cuDoubleComplex *diagonal = new cuDoubleComplex [N];
+    // A temporary row
+    cuDoubleComplex *tmp_row = new cuDoubleComplex [N];
+    // index of non-zero elements in tmp_row
+    int *filled_idx = new int [N];
+    // Beginning index of each row in the output matrix
+    int *row_st_idx = new int [N];
+
+    size_t i, j, f;
+
+    // Set initial values
+    for (i = 0; i < N; i++)
+    {
+        diagonal[i].x = 0.0; diagonal[i].y = 0.0;
+        tmp_row[i].x = 0.0; tmp_row[i].y = 0.0;
+        filled_idx[i] = -1;
+        row_st_idx[i] = -1;
+    }
+
+    // copy elements in the lower triangle to the output matrix
+    j = 0;
+    for (i = 0; i < nz_size; i++)
+    {
+        if (row[i] >= col[i])
+        {
+            IC_row[j] = row[i];
+            IC_col[j] = col[i];
+            IC_val[j] = val[i];
+            j++;
+        }
+    }
+
+    // Get the beginning index of each row in the matrix. A new slot is opened every time 
+    // the row number increases, so slot k only matches row k if no row is empty.
+    j = 1;
+    row_st_idx[0] = 0; // the first stored row begins at storage index 0
+    size_t old_row = IC_row[0];
+    for (i = 1; i < lnz_size; i++)
+    {
+        if (IC_row[i] > old_row)
+        {
+            row_st_idx[j] = i;
+            old_row = IC_row[i];
+            j++;
+        }
+    }
+
+    // Calculate the first element
+    IC_val[0] = clcg_Zsqrt(IC_val[0]);
+    diagonal[0] = IC_val[0];
+
+    // Sum of the squared off-diagonal factor values of the row being processed. 
+    // It is consumed and cleared when the diagonal entry of that row is reached.
+    cuDoubleComplex dia_sum;
+    dia_sum.x = 0.0; dia_sum.y = 0.0;
+    // The first one is already calculated
+    for (i = 1; i < lnz_size; i++)
+    {
+        // Calculate the first column if there is one
+        if (IC_col[i] == 0)
+        {
+            IC_val[i] = cuCdiv(IC_val[i], IC_val[0]);
+            dia_sum = clcg_Zsum(dia_sum, cuCmul(IC_val[i], IC_val[i]));
+            continue; // Case 1 break
+        }
+        
+        // Calculate elements in the middle of a row
+        if (IC_row[i] > IC_col[i])
+        {
+            // Scatter the already factorized entries of row IC_col[i] (left of its 
+            // diagonal) into tmp_row and remember which slots were touched
+            f = 0;
+            j = row_st_idx[IC_col[i]];
+            while (IC_col[j] < IC_col[i])
+            {
+                tmp_row[IC_col[j]] = IC_val[j];
+                filled_idx[f]  = IC_col[j];
+                f++;
+                j++;
+            }
+
+            // Subtract the products of the matching columns of the current row
+            j = row_st_idx[IC_row[i]];
+            while (IC_col[j] < IC_col[i])
+            {
+                IC_val[i] = clcg_Zdiff(IC_val[i], cuCmul(IC_val[j], tmp_row[IC_col[j]]));
+                j++;
+            }
+            
+            IC_val[i] = cuCdiv(IC_val[i], diagonal[IC_col[i]]);
+            dia_sum = clcg_Zsum(dia_sum, cuCmul(IC_val[i], IC_val[i]));
+
+            // reset tmp variables (only the touched slots)
+            for (j = 0; j < f; j++)
+            {
+                tmp_row[filled_idx[j]].x = 0.0; tmp_row[filled_idx[j]].y = 0.0;
+            }
+
+            continue; // Case 2 break
+        }
+        
+        // We have reached the diagonal position
+        if (IC_row[i] == IC_col[i])
+        {
+            IC_val[i] = clcg_Zsqrt(clcg_Zdiff(IC_val[i], dia_sum));
+            diagonal[IC_col[i]] = IC_val[i];
+            dia_sum.x = 0.0; dia_sum.y = 0.0;
+        }
+    }
+
+    delete[] diagonal;
+    delete[] tmp_row;
+    delete[] row_st_idx;
+    delete[] filled_idx;
+    return;
+}
+
+/**
+ * @brief Perform the incomplete Cholesky factorization for a sparse matrix saved in the COO format 
+ * and store the factor in both triangles of the output matrix.
+ * 
+ * The sparsity pattern of the input is kept. Each factorized element (r, c) of the lower triangle 
+ * is additionally copied to the mirrored position (c, r) of the upper triangle whenever that 
+ * position exists in the input pattern.
+ * 
+ * @note  This routine relies on the elements being sorted by row and then by column, on every row 
+ * holding at least one element, and on the first element being the (0, 0) diagonal entry. 
+ * Products are formed with cuCmul() without conjugation.
+ * 
+ * @param[in]  row      Row index of the input sparse matrix.
+ * @param[in]  col      Column index of the input sparse matrix.
+ * @param[in]  val      Non-zero values of the input sparse matrix.
+ * @param[in]  N        Row/Column size of the sparse matrix.
+ * @param[in]  nz_size  Number of the non-zero elements.
+ * @param[out] IC_row   Row index of the factorized sparse matrix (nz_size entries are written).
+ * @param[out] IC_col   Column index of the factorized sparse matrix (nz_size entries are written).
+ * @param[out] IC_val   Non-zero values of the factorized sparse matrix (nz_size entries are written).
+ */
+void clcg_incomplete_Cholesky_cuda_full(const int *row, const int *col, const cuDoubleComplex *val, int N, int nz_size, int *IC_row, int *IC_col, cuDoubleComplex *IC_val)
+{
+    // Nothing to factorize. This also protects the unconditional accesses to element 0 below.
+    if (N <= 0 || nz_size <= 0) return;
+
+    // Unsigned copies of the sizes: loop counters and linear coordinates are handled in size_t
+    const size_t dim = static_cast<size_t>(N);
+    const size_t nnz = static_cast<size_t>(nz_size);
+
+    // Diagonal elements of the factorized lower triangular matrix
+    cuDoubleComplex *diagonal = new cuDoubleComplex [dim];
+    // A temporary dense row
+    cuDoubleComplex *tmp_row = new cuDoubleComplex [dim];
+    // Indices of the non-zero elements currently stored in tmp_row
+    int *filled_idx = new int [dim];
+    // Beginning index of each row in the input matrix
+    int *row_st_idx = new int [dim];
+
+    size_t i, j, f, l;
+
+    // Set initial values
+    for (i = 0; i < dim; i++)
+    {
+        diagonal[i].x = 0.0; diagonal[i].y = 0.0;
+        tmp_row[i].x = 0.0; tmp_row[i].y = 0.0;
+        filled_idx[i] = -1;
+        row_st_idx[i] = -1;
+    }
+
+    // Copy elements to the output matrix; the factorization is then done in place
+    for (i = 0; i < nnz; i++)
+    {
+        IC_row[i] = row[i];
+        IC_col[i] = col[i];
+        IC_val[i] = val[i];
+    }
+
+    // Count the elements of the lower triangle (diagonal included) and map the linear 
+    // coordinate (N*row + col) of every upper triangular element to its index in the arrays
+    size_t order, L_nz = 0;
+    std::map<size_t, size_t> index_map;
+    std::map<size_t, size_t>::iterator mirror;
+
+    for (i = 0; i < nnz; i++)
+    {
+        if (row[i] >= col[i]) // Lower triangular part
+        {
+            L_nz++;
+        }
+        else // Only the upper triangular part needs to be mapped
+        {
+            // The product is evaluated in size_t; an int product overflows for large N
+            order = dim*static_cast<size_t>(row[i]) + static_cast<size_t>(col[i]);
+            index_map[order] = i;
+        }
+    }
+
+    // Indices of the lower triangular elements, in their original order
+    j = 0;
+    size_t *low_idx = new size_t [L_nz];
+    for (i = 0; i < nnz; i++)
+    {
+        if (row[i] >= col[i])
+        {
+            low_idx[j] = i;
+            j++;
+        }
+    }
+
+    // Get the beginning index of each row in the matrix
+    j = 1;
+    row_st_idx[0] = IC_row[0];
+    size_t old_row = IC_row[0];
+    for (i = 1; i < nnz; i++)
+    {
+        if (static_cast<size_t>(IC_row[i]) > old_row)
+        {
+            row_st_idx[j] = static_cast<int>(i);
+            old_row = IC_row[i];
+            j++;
+        }
+    }
+
+    // Calculate the first element
+    IC_val[0] = clcg_Zsqrt(IC_val[0]);
+    diagonal[0] = IC_val[0];
+
+    // Running sum of the squared off-diagonal elements of the current row
+    cuDoubleComplex dia_sum;
+    dia_sum.x = 0.0; dia_sum.y = 0.0;
+    // The first one is already calculated
+    for (i = 1; i < L_nz; i++)
+    {
+        l = low_idx[i];
+
+        // Case 1: element of the first column
+        if (IC_col[l] == 0)
+        {
+            IC_val[l] = cuCdiv(IC_val[l], IC_val[0]);
+            dia_sum = clcg_Zsum(dia_sum, cuCmul(IC_val[l], IC_val[l]));
+
+            // Mirror the value to (0, row), whose linear coordinate is N*0 + row.
+            // find() is used so that a missing mirror entry is skipped instead of 
+            // being inserted with index 0, which would overwrite IC_val[0].
+            order = static_cast<size_t>(IC_row[l]);
+            mirror = index_map.find(order);
+            if (mirror != index_map.end()) IC_val[mirror->second] = IC_val[l];
+            continue;
+        }
+        
+        // Case 2: off-diagonal element in the middle of a row
+        if (IC_row[l] > IC_col[l])
+        {
+            // Scatter the already factorized elements of row IC_col[l] into tmp_row
+            f = 0;
+            j = row_st_idx[IC_col[l]];
+            while (IC_col[j] < IC_col[l])
+            {
+                tmp_row[IC_col[j]] = IC_val[j];
+                filled_idx[f]  = IC_col[j];
+                f++;
+                j++;
+            }
+
+            // Subtract the products with the preceding elements of the current row
+            j = row_st_idx[IC_row[l]];
+            while (IC_col[j] < IC_col[l])
+            {
+                IC_val[l] = clcg_Zdiff(IC_val[l], cuCmul(IC_val[j], tmp_row[IC_col[j]]));
+                j++;
+            }
+            
+            IC_val[l] = cuCdiv(IC_val[l], diagonal[IC_col[l]]);
+            dia_sum = clcg_Zsum(dia_sum, cuCmul(IC_val[l], IC_val[l]));
+
+            // Mirror the value to (col, row) of the upper triangle if it exists
+            order = dim*static_cast<size_t>(IC_col[l]) + static_cast<size_t>(IC_row[l]);
+            mirror = index_map.find(order);
+            if (mirror != index_map.end()) IC_val[mirror->second] = IC_val[l];
+
+            // Reset only the touched entries of tmp_row
+            for (j = 0; j < f; j++)
+            {
+                tmp_row[filled_idx[j]].x = 0.0; tmp_row[filled_idx[j]].y = 0.0;
+            }
+
+            continue;
+        }
+        
+        // Case 3: we have reached the diagonal position
+        if (IC_row[l] == IC_col[l])
+        {
+            IC_val[l] = clcg_Zsqrt(clcg_Zdiff(IC_val[l], dia_sum));
+            diagonal[IC_col[l]] = IC_val[l];
+            dia_sum.x = 0.0; dia_sum.y = 0.0;
+        }
+    }
+
+    delete[] diagonal;
+    delete[] tmp_row;
+    delete[] row_st_idx;
+    delete[] filled_idx;
+    delete[] low_idx;
+    return;
+}
--- a/src/lib/preconditioner_cuda.h
+++ b/src/lib/preconditioner_cuda.h
@@ -0,0 +1,92 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _PRECONDITIONER_CUDA_H
+#define _PRECONDITIONER_CUDA_H
+
+#include "lcg_complex_cuda.h"
+
+#ifdef LibLCG_CUDA
+
+/**
+ * @brief Return the number of non-zero elements in the lower triangular part of the input matrix.
+ * 
+ * @note  Presumably the returned count is the value expected as lnz_size by 
+ * clcg_incomplete_Cholesky_cuda_half() -- to be confirmed against the implementation.
+ * 
+ * @param[in]  row       Row index of the input sparse matrix.
+ * @param[in]  col       Column index of the input sparse matrix.
+ * @param[in]  nz_size   Number of the non-zero elements.
+ * @param[out] lnz_size  Number of the non-zero elements in the lower triangle.
+ */
+void clcg_incomplete_Cholesky_cuda_half_buffsize(const int *row, const int *col, int nz_size, int *lnz_size);
+
+/**
+ * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format. 
+ * This overload works on single precision complex values (cuComplex).
+ * 
+ * @note  Only the factorized lower triangular matrix is stored in the lower part of the output matrix accordingly.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  val        Non-zero values of the input sparse matrix.
+ * @param[in]  N          Row/Column size of the sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements.
+ * @param[in]  lnz_size   Number of the non-zero elements in the lower triangle.
+ * @param[out] IC_row     Row index of the factorized triangular sparse matrix.
+ * @param[out] IC_col     Column index of the factorized triangular sparse matrix.
+ * @param[out] IC_val     Non-zero values of the factorized triangular sparse matrix.
+ */
+void clcg_incomplete_Cholesky_cuda_half(const int *row, const int *col, const cuComplex *val, int N, int nz_size, int lnz_size, int *IC_row, int *IC_col, cuComplex *IC_val);
+
+/**
+ * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format. 
+ * This overload works on double precision complex values (cuDoubleComplex).
+ * 
+ * @note  Only the factorized lower triangular matrix is stored in the lower part of the output matrix accordingly.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  val        Non-zero values of the input sparse matrix.
+ * @param[in]  N          Row/Column size of the sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements.
+ * @param[in]  lnz_size   Number of the non-zero elements in the lower triangle.
+ * @param[out] IC_row     Row index of the factorized triangular sparse matrix.
+ * @param[out] IC_col     Column index of the factorized triangular sparse matrix.
+ * @param[out] IC_val     Non-zero values of the factorized triangular sparse matrix.
+ */
+void clcg_incomplete_Cholesky_cuda_half(const int *row, const int *col, const cuDoubleComplex *val, int N, int nz_size, int lnz_size, int *IC_row, int *IC_col, cuDoubleComplex *IC_val);
+
+/**
+ * @brief Perform the incomplete Cholesky factorization for a sparse matrix that is saved in the COO format.
+ * 
+ * @note  The factorized lower and upper triangular matrices are stored in the lower and upper triangular parts of the output matrix accordingly.
+ * 
+ * @param[in]  row        Row index of the input sparse matrix.
+ * @param[in]  col        Column index of the input sparse matrix.
+ * @param[in]  val        Non-zero values of the input sparse matrix.
+ * @param[in]  N          Row/Column size of the sparse matrix.
+ * @param[in]  nz_size    Number of the non-zero elements.
+ * @param[out] IC_row     Row index of the factorized sparse matrix.
+ * @param[out] IC_col     Column index of the factorized sparse matrix.
+ * @param[out] IC_val     Non-zero values of the factorized sparse matrix.
+ */
+void clcg_incomplete_Cholesky_cuda_full(const int *row, const int *col, const cuDoubleComplex *val, int N, int nz_size, int *IC_row, int *IC_col, cuDoubleComplex *IC_val);
+
+#endif // LibLCG_CUDA
+
+#endif // _PRECONDITIONER_CUDA_H
--- a/src/lib/preconditioner_eigen.cpp
+++ b/src/lib/preconditioner_eigen.cpp
--- a/src/lib/preconditioner_eigen.h
+++ b/src/lib/preconditioner_eigen.h
@@ -0,0 +1,159 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _PRECONDITIONER_EIGEN_H
+#define _PRECONDITIONER_EIGEN_H
+
+#include "complex"
+#include "Eigen/Dense"
+#include "Eigen/SparseCore"
+
+
+/**
+ * @brief     Perform the Cholesky decomposition and return the lower triangular matrix.
+ * 
+ * @note      This could serve as a direct solver.
+ * 
+ * @param[in]  A   The input real matrix. Must be full rank and symmetric (i.e. A = A^T).
+ * @param[out] L   The output lower triangular matrix.
+ */
+void lcg_Cholesky(const Eigen::MatrixXd &A, Eigen::MatrixXd &L);
+
+/**
+ * @brief      Perform the Cholesky decomposition and return the lower triangular matrix.
+ * 
+ * @note       This could serve as a direct solver.
+ *
+ * @param[in]  A     The input complex matrix. Must be full rank and symmetric (i.e. A = A^T).
+ * @param[out] L     The output lower triangular matrix.
+ */
+void clcg_Cholesky(const Eigen::MatrixXcd &A, Eigen::MatrixXcd &L);
+
+/**
+ * @brief      Calculate the inverse of a real lower triangular matrix (full rank only).
+ *
+ * @param[in]  L     The lower triangular matrix to be inverted.
+ * @param[out] Linv  The inverse of the lower triangular matrix.
+ */
+void lcg_invert_lower_triangle(const Eigen::MatrixXd &L, Eigen::MatrixXd &Linv);
+
+/**
+ * @brief      Calculate the inverse of a real upper triangular matrix (full rank only).
+ *
+ * @param[in]  U     The upper triangular matrix to be inverted.
+ * @param[out] Uinv  The inverse of the upper triangular matrix.
+ */
+void lcg_invert_upper_triangle(const Eigen::MatrixXd &U, Eigen::MatrixXd &Uinv);
+
+/**
+ * @brief      Calculate the inverse of a complex lower triangular matrix (full rank only).
+ *
+ * @param[in]  L     The lower triangular matrix to be inverted.
+ * @param[out] Linv  The inverse of the lower triangular matrix.
+ */
+void clcg_invert_lower_triangle(const Eigen::MatrixXcd &L, Eigen::MatrixXcd &Linv);
+
+/**
+ * @brief      Calculate the inverse of a complex upper triangular matrix (full rank only).
+ *
+ * @param[in]  U     The upper triangular matrix to be inverted.
+ * @param[out] Uinv  The inverse of the upper triangular matrix.
+ */
+void clcg_invert_upper_triangle(const Eigen::MatrixXcd &U, Eigen::MatrixXcd &Uinv);
+
+/**
+ * @brief      Calculate the incomplete Cholesky decomposition and return the lower triangular matrix.
+ *
+ * @param[in]  A     The input row-major sparse matrix. Must be full rank and symmetric (i.e. A = A^T).
+ * @param[out] L     The output row-major lower triangular matrix.
+ * @param[in]  fill  The fill-in number of the output sparse matrix. No fill-in reduction will be processed if this variable is set to zero (the default).
+ */
+void lcg_incomplete_Cholesky(const Eigen::SparseMatrix<double, Eigen::RowMajor> &A, Eigen::SparseMatrix<double, Eigen::RowMajor> &L, size_t fill = 0);
+
+/**
+ * @brief      Calculate the incomplete Cholesky decomposition of a complex matrix and return the lower triangular matrix.
+ *
+ * @param[in]  A     The input row-major sparse matrix. Must be full rank and symmetric (i.e. A = A^T).
+ * @param[out] L     The output row-major lower triangular matrix.
+ * @param[in]  fill  The fill-in number of the output sparse matrix. No fill-in reduction will be processed if this variable is set to zero (the default).
+ */
+void clcg_incomplete_Cholesky(const Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &A, Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &L, size_t fill = 0);
+
+/**
+ * @brief        Calculate the incomplete LU factorization of a real sparse matrix.
+ * 
+ * @param[in]  A      The input row-major sparse matrix. Must be full rank.
+ * @param[out] L      The output lower triangular matrix.
+ * @param[out] U      The output upper triangular matrix.
+ * @param[in]  fill   The fill-in number of the output sparse matrix. No fill-in reduction will be processed if this variable is set to zero (the default).
+ */
+void lcg_incomplete_LU(const Eigen::SparseMatrix<double, Eigen::RowMajor> &A, Eigen::SparseMatrix<double, Eigen::RowMajor> &L, Eigen::SparseMatrix<double, Eigen::RowMajor> &U, size_t fill = 0);
+
+/**
+ * @brief        Calculate the incomplete LU factorization of a complex sparse matrix.
+ * 
+ * @param[in]  A      The input row-major sparse matrix. Must be full rank.
+ * @param[out] L      The output lower triangular matrix.
+ * @param[out] U      The output upper triangular matrix.
+ * @param[in]  fill   The fill-in number of the output sparse matrix. No fill-in reduction will be processed if this variable is set to zero (the default).
+ */
+void clcg_incomplete_LU(const Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &A, Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &L, 
+    Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &U, size_t fill = 0);
+
+/**
+ * @brief    Solve the real linear system Lx = B, in which L is a lower triangular matrix.
+ * 
+ * @param[in]  L  The input row-major lower triangular matrix.
+ * @param[in]  B  The right-hand side vector of the system.
+ * @param[out] X  The solution vector.
+ */
+void lcg_solve_lower_triangle(const Eigen::SparseMatrix<double, Eigen::RowMajor> &L, const Eigen::VectorXd &B, Eigen::VectorXd &X);
+
+/**
+ * @brief    Solve the real linear system Ux = B, in which U is an upper triangular matrix.
+ * 
+ * @param[in]  U  The input row-major upper triangular matrix.
+ * @param[in]  B  The right-hand side vector of the system.
+ * @param[out] X  The solution vector.
+ */
+void lcg_solve_upper_triangle(const Eigen::SparseMatrix<double, Eigen::RowMajor> &U, const Eigen::VectorXd &B, Eigen::VectorXd &X);
+
+/**
+ * @brief    Solve the complex linear system Lx = B, in which L is a lower triangular matrix.
+ * 
+ * @param[in]  L  The input row-major lower triangular matrix.
+ * @param[in]  B  The right-hand side vector of the system.
+ * @param[out] X  The solution vector.
+ */
+void clcg_solve_lower_triangle(const Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &L, const Eigen::VectorXcd &B, Eigen::VectorXcd &X);
+
+/**
+ * @brief    Solve the complex linear system Ux = B, in which U is an upper triangular matrix.
+ * 
+ * @param[in]  U  The input row-major upper triangular matrix.
+ * @param[in]  B  The right-hand side vector of the system.
+ * @param[out] X  The solution vector.
+ */
+void clcg_solve_upper_triangle(const Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> &U, const Eigen::VectorXcd &B, Eigen::VectorXcd &X);
+
+
+#endif // _PRECONDITIONER_EIGEN_H
--- a/src/lib/solver.cpp
+++ b/src/lib/solver.cpp
@@ -0,0 +1,311 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "solver.h"
+
+#include "ctime"
+#include "iostream"
+
+#include "config.h"
+#ifdef LibLCG_OPENMP
+#include "omp.h"
+#endif
+
+/**
+ * Construct a solver with the library's default parameters, a report 
+ * interval of one, and progress reporting enabled.
+ */
+LCG_Solver::LCG_Solver()
+	: param_(lcg_default_parameters()), inter_(1), silent_(false)
+{
+}
+
+/**
+ * Default progress monitor. Writes the iteration count and the convergence value 
+ * to std::clog, overwriting the current line, when the iteration number is a 
+ * multiple of the report interval or when the convergence value has dropped to 
+ * param->epsilon or below. The solution m and its size n_size are not used.
+ * 
+ * @return Always 0.
+ */
+int LCG_Solver::Progress(const lcg_float* m, const lcg_float converge, 
+	const lcg_para *param, const int n_size, const int k)
+{
+	// An interval of zero disables the periodic report
+	const bool on_interval = (inter_ > 0 && k%inter_ == 0);
+
+	// param is only inspected when the periodic report does not fire
+	if (on_interval || converge <= param->epsilon)
+	{
+		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
+	}
+
+	return 0;
+}
+
+/**
+ * Switch the solver to silent mode; the Minimize*() functions then run 
+ * without a progress callback and without a timing report.
+ */
+void LCG_Solver::silent()
+{
+	this->silent_ = true;
+}
+
+/**
+ * Store the interval (in iterations) at which Progress() writes its report.
+ */
+void LCG_Solver::set_report_interval(unsigned int inter)
+{
+	this->inter_ = inter;
+}
+
+/**
+ * Replace the stored algorithm parameters with a copy of in_param.
+ */
+void LCG_Solver::set_lcg_parameter(const lcg_para &in_param)
+{
+	this->param_ = in_param;
+}
+
+/**
+ * Run lcg_solver() with this object's AxProduct() and report the outcome.
+ * 
+ * In silent mode no progress callback is installed and nothing is printed; a 
+ * negative status is passed to lcg_error_str() with true in place of er_throw.
+ * Otherwise the run is timed and, unless er_throw is set, the solver name and 
+ * the elapsed time in milliseconds are written to std::clog.
+ * 
+ * @param m          Pointer of the solution vector
+ * @param b          Pointer of the targeting vector
+ * @param x_size     Size of the solution vector
+ * @param solver_id  Solver type
+ * @param verbose    When true the returned status is always passed to lcg_error_str(), otherwise only a negative one
+ * @param er_throw   Forwarded to lcg_error_str(); also suppresses the timing report when true
+ */
+void LCG_Solver::Minimize(lcg_float *m, const lcg_float *b, int x_size, 
+	lcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		int ret = lcg_solver(_AxProduct, nullptr, m, b, x_size, &param_, this, solver_id);
+		if (ret < 0) lcg_error_str(ret, true);
+		return;
+	}
+	
+	// Solve with lcg. Note that default arguments can not be omitted when the 
+	// solver function is called through a function pointer
+#ifdef LibLCG_OPENMP
+	double start = omp_get_wtime();
+	int ret = lcg_solver(_AxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
+	double end = omp_get_wtime();
+
+	lcg_float costime = 1000*(end-start);
+#else
+	clock_t start = clock();
+	int ret = lcg_solver(_AxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
+	clock_t end = clock();
+
+	// Convert to double before scaling; 1000*(end-start) in clock_t can overflow 
+	// on platforms where clock_t is a 32-bit integer
+	lcg_float costime = 1000.0*(double)(end-start)/(double)CLOCKS_PER_SEC;
+#endif
+	
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		switch (solver_id)
+		{
+			case LCG_CG:
+				std::clog << "Solver: CG. Time cost: " << costime << " ms" << std::endl;
+				break;
+			case LCG_CGS:
+				std::clog << "Solver: CGS. Time cost: " << costime << " ms" << std::endl;
+				break;
+			case LCG_BICGSTAB:
+				std::clog << "Solver: BICGSTAB. Times cost: " << costime << " ms" << std::endl;
+				break;
+			case LCG_BICGSTAB2:
+				std::clog << "Solver: BICGSTAB2. Time cost: " << costime << " ms" << std::endl;
+				break;
+			default:
+				std::clog << "Solver: Unknown. Time cost: " << costime << " ms" << std::endl;
+				break;
+		}	
+	}
+
+	// A non-negative status is only reported on request
+	if (verbose || ret < 0) lcg_error_str(ret, er_throw);
+	return;
+}
+
+/**
+ * Run lcg_solver_preconditioned() with this object's AxProduct() and 
+ * MxProduct() and report the outcome.
+ * 
+ * In silent mode no progress callback is installed and nothing is printed; a 
+ * negative status is passed to lcg_error_str() with true in place of er_throw.
+ * Otherwise the run is timed and, unless er_throw is set, the solver name and 
+ * the elapsed time in milliseconds are written to std::clog.
+ * 
+ * @param m          Pointer of the solution vector
+ * @param b          Pointer of the targeting vector
+ * @param x_size     Size of the solution vector
+ * @param solver_id  Solver type
+ * @param verbose    When true the returned status is always passed to lcg_error_str(), otherwise only a negative one
+ * @param er_throw   Forwarded to lcg_error_str(); also suppresses the timing report when true
+ */
+void LCG_Solver::MinimizePreconditioned(lcg_float *m, const lcg_float *b, int x_size, 
+	lcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		int ret = lcg_solver_preconditioned(_AxProduct, _MxProduct, nullptr, m, b, x_size, &param_, this, solver_id);
+		if (ret < 0) lcg_error_str(ret, true);
+		return;
+	}
+	
+	// Solve with lcg. Note that default arguments can not be omitted when the 
+	// solver function is called through a function pointer
+#ifdef LibLCG_OPENMP
+	double start = omp_get_wtime();
+	int ret = lcg_solver_preconditioned(_AxProduct, _MxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
+	double end = omp_get_wtime();
+
+	lcg_float costime = 1000*(end-start);
+#else
+	clock_t start = clock();
+	int ret = lcg_solver_preconditioned(_AxProduct, _MxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
+	clock_t end = clock();
+
+	// Convert to double before scaling; 1000*(end-start) in clock_t can overflow 
+	// on platforms where clock_t is a 32-bit integer
+	lcg_float costime = 1000.0*(double)(end-start)/(double)CLOCKS_PER_SEC;
+#endif
+	
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		switch (solver_id)
+		{
+			case LCG_PCG:
+				std::clog << "Solver: PCG. Time cost: " << costime << " ms" << std::endl;
+				break;
+			default:
+				std::clog << "Solver: Unknown. Time cost: " << costime << " ms" << std::endl;
+				break;
+		}	
+	}
+
+	// A non-negative status is only reported on request
+	if (verbose || ret < 0) lcg_error_str(ret, er_throw);
+	return;
+}
+
+/**
+ * Run lcg_solver_constrained() with this object's AxProduct() and report the outcome.
+ * 
+ * In silent mode no progress callback is installed and nothing is printed; a 
+ * negative status is passed to lcg_error_str() with true in place of er_throw.
+ * Otherwise the run is timed and, unless er_throw is set, the solver name and 
+ * the elapsed time in milliseconds are written to std::clog.
+ * 
+ * @param m          Pointer of the solution vector
+ * @param b          Pointer of the targeting vector
+ * @param low        Lower bound of the solution vector
+ * @param hig        Higher bound of the solution vector
+ * @param x_size     Size of the solution vector
+ * @param solver_id  Solver type
+ * @param verbose    When true the returned status is always passed to lcg_error_str(), otherwise only a negative one
+ * @param er_throw   Forwarded to lcg_error_str(); also suppresses the timing report when true
+ */
+void LCG_Solver::MinimizeConstrained(lcg_float *m, const lcg_float *b, const lcg_float* low, 
+	const lcg_float *hig, int x_size, lcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		int ret = lcg_solver_constrained(_AxProduct, nullptr, m, b, low, hig, x_size, &param_, this, solver_id);
+		if (ret < 0) lcg_error_str(ret, true);
+		return;
+	}
+
+	// Solve with lcg. Note that default arguments can not be omitted when the 
+	// solver function is called through a function pointer
+#ifdef LibLCG_OPENMP
+	double start = omp_get_wtime();
+	int ret = lcg_solver_constrained(_AxProduct, _Progress, m, b, low, hig, x_size, &param_, this, solver_id);
+	double end = omp_get_wtime();
+
+	lcg_float costime = 1000*(end-start);
+#else
+	clock_t start = clock();
+	int ret = lcg_solver_constrained(_AxProduct, _Progress, m, b, low, hig, x_size, &param_, this, solver_id);
+	clock_t end = clock();
+
+	// Convert to double before scaling; 1000*(end-start) in clock_t can overflow 
+	// on platforms where clock_t is a 32-bit integer
+	lcg_float costime = 1000.0*(double)(end-start)/(double)CLOCKS_PER_SEC;
+#endif
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		switch (solver_id)
+		{
+			case LCG_PG:
+				std::clog << "Solver: PG-CG. Time cost: " << costime << " ms" << std::endl;
+				break;
+			case LCG_SPG:
+				std::clog << "Solver: SPG-CG. Time cost: " << costime << " ms" << std::endl;
+				break;
+			default:
+				std::clog << "Solver: Unknown. Time cost: " << costime << " ms" << std::endl;
+				break;
+		}
+	}
+
+	// A non-negative status is only reported on request
+	if (verbose || ret < 0) lcg_error_str(ret, er_throw);
+	return;
+}
+
+
+/**
+ * Construct a complex solver with the library's default parameters, a report 
+ * interval of one, and progress reporting enabled.
+ */
+CLCG_Solver::CLCG_Solver()
+	: param_(clcg_default_parameters()), inter_(1), silent_(false)
+{
+}
+
+/**
+ * Default progress monitor. Writes the iteration count and the convergence value 
+ * to std::clog, overwriting the current line, when the iteration number is a 
+ * multiple of the report interval or when the convergence value has dropped to 
+ * param->epsilon or below. The solution m and its size n_size are not used.
+ * 
+ * @return Always 0.
+ */
+int CLCG_Solver::Progress(const lcg_complex* m, const lcg_float converge, 
+	const clcg_para* param, const int n_size, const int k)
+{
+	// An interval of zero disables the periodic report
+	const bool on_interval = (inter_ > 0 && k%inter_ == 0);
+
+	// param is only inspected when the periodic report does not fire
+	if (on_interval || converge <= param->epsilon)
+	{
+		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
+	}
+
+	return 0;
+}
+
+/**
+ * Switch the solver to silent mode; Minimize() then runs without a 
+ * progress callback and without a timing report.
+ */
+void CLCG_Solver::silent()
+{
+	this->silent_ = true;
+}
+
+/**
+ * Store the interval (in iterations) at which Progress() writes its report.
+ */
+void CLCG_Solver::set_report_interval(unsigned int inter)
+{
+	this->inter_ = inter;
+}
+
+/**
+ * Replace the stored algorithm parameters with a copy of in_param.
+ */
+void CLCG_Solver::set_clcg_parameter(const clcg_para &in_param)
+{
+	this->param_ = in_param;
+}
+
+/**
+ * Run clcg_solver() with this object's AxProduct() and report the outcome.
+ * 
+ * In silent mode no progress callback is installed and nothing is printed; a 
+ * negative status is passed to clcg_error_str() with true in place of er_throw.
+ * Otherwise the run is timed and, unless er_throw is set, the solver name and 
+ * the elapsed time in milliseconds are written to std::clog.
+ * 
+ * @param m          Pointer of the solution vector
+ * @param b          Pointer of the targeting vector
+ * @param x_size     Size of the solution vector
+ * @param solver_id  Solver type
+ * @param verbose    When true the returned status is always passed to clcg_error_str(), otherwise only a negative one
+ * @param er_throw   Forwarded to clcg_error_str(); also suppresses the timing report when true
+ */
+void CLCG_Solver::Minimize(lcg_complex *m, const lcg_complex *b, int x_size, 
+	clcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		int ret = clcg_solver(_AxProduct, nullptr, m, b, x_size, &param_, this, solver_id);
+		if (ret < 0) clcg_error_str(ret, true);
+		return;
+	}
+
+	// Solve with clcg. Note that default arguments can not be omitted when the 
+	// solver function is called through a function pointer
+#ifdef LibLCG_OPENMP
+	double start = omp_get_wtime();
+	int ret = clcg_solver(_AxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
+	double end = omp_get_wtime();
+
+	lcg_float costime = 1000*(end-start);
+#else
+	clock_t start = clock();
+	int ret = clcg_solver(_AxProduct, _Progress, m, b, x_size, &param_, this, solver_id);
+	clock_t end = clock();
+
+	// Convert to double before scaling; 1000*(end-start) in clock_t can overflow 
+	// on platforms where clock_t is a 32-bit integer
+	lcg_float costime = 1000.0*(double)(end-start)/(double)CLOCKS_PER_SEC;
+#endif
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		switch (solver_id)
+		{
+			case CLCG_BICG:
+				std::clog << "Solver: Bi-CG. Times cost: " << costime << " ms" << std::endl;
+				break;
+			case CLCG_BICG_SYM:
+				std::clog << "Solver: Bi-CG (symmetrically accelerated). Times cost: " << costime << " ms" << std::endl;
+				break;
+			case CLCG_CGS:
+				std::clog << "Solver: CGS. Times cost: " << costime << " ms" << std::endl;
+				break;
+			case CLCG_TFQMR:
+				std::clog << "Solver: TFQMR. Times cost: " << costime << " ms" << std::endl;
+				break;
+			default:
+				std::clog << "Solver: Unknown. Times cost: " << costime << " ms" << std::endl;
+				break;
+		}
+	}
+
+	// A non-negative status is only reported on request
+	if (verbose || ret < 0) clcg_error_str(ret, er_throw);
+	return;
+}
--- a/src/lib/solver.h
+++ b/src/lib/solver.h
@@ -0,0 +1,285 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _SOLVER_H
+#define _SOLVER_H
+
+#include "lcg.h"
+#include "clcg.h"
+
+/**
+ * @brief      Base class of the real-valued linear conjugate gradient solvers.
+ * 
+ * Derived classes supply the products A*x and M^-1*x. The static functions 
+ * whose names begin with an underscore are handed to the lcg_solver*() routines 
+ * as callbacks; they receive the object as an opaque pointer and forward the 
+ * call to the matching virtual member function.
+ */
+class LCG_Solver
+{
+protected:
+	lcg_para param_;      // Parameters handed to the solver routines
+	unsigned int inter_;  // Report interval used by Progress()
+	bool silent_;         // Set by silent(); disables progress and timing reports
+
+public:
+	LCG_Solver();
+	virtual ~LCG_Solver(){}
+
+	/**
+	 * @brief       Callback adaptor for the product A*x
+	 * 
+	 * @param instance   Pointer to the LCG_Solver object that receives the call
+	 * @param a[in]      Pointer of the multiplier
+	 * @param b[out]     Pointer of the product
+	 * @param num        Size of the array
+	 */
+	static void _AxProduct(void* instance, const lcg_float* a, lcg_float* b, const int num)
+	{
+		LCG_Solver *self = static_cast<LCG_Solver*>(instance);
+		self->AxProduct(a, b, num);
+	}
+
+	/**
+	 * @brief       Product A*x, to be implemented by the derived class
+	 * 
+	 * @param a[in]     Pointer of the multiplier
+	 * @param b[out]    Pointer of the product
+	 * @param num       Size of the array
+	 */
+	virtual void AxProduct(const lcg_float* a, lcg_float* b, const int num) = 0;
+
+	/**
+	 * @brief       Callback adaptor for the product M^-1*x
+	 * 
+	 * @param instance   Pointer to the LCG_Solver object that receives the call
+	 * @param a[in]      Pointer of the multiplier
+	 * @param b[out]     Pointer of the product
+	 * @param num        Size of the array
+	 */
+	static void _MxProduct(void* instance, const lcg_float* a, lcg_float* b, const int num)
+	{
+		LCG_Solver *self = static_cast<LCG_Solver*>(instance);
+		self->MxProduct(a, b, num);
+	}
+
+	/**
+	 * @brief       Product M^-1*x, to be implemented by the derived class
+	 * 
+	 * @param a[in]     Pointer of the multiplier
+	 * @param b[out]    Pointer of the product
+	 * @param num       Size of the array
+	 */
+	virtual void MxProduct(const lcg_float* a, lcg_float* b, const int num) = 0;
+
+	/**
+	 * @brief       Callback adaptor for the process monitoring
+	 * 
+	 * @param instance    Pointer to the LCG_Solver object that receives the call
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param k           Current iteration times
+	 * @return int        Status returned by Progress()
+	 */
+	static int _Progress(void* instance, const lcg_float* m, const lcg_float converge, 
+		const lcg_para *param, const int n_size, const int k)
+	{
+		LCG_Solver *self = static_cast<LCG_Solver*>(instance);
+		return self->Progress(m, converge, param, n_size, k);
+	}
+
+	/**
+	 * @brief       Process monitoring; may be overridden by the derived class
+	 * 
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param k           Current iteration times
+	 * @return int        Status of the process
+	 */
+	virtual int Progress(const lcg_float* m, const lcg_float converge, 
+		const lcg_para *param, const int n_size, const int k);
+
+	/**
+	 * @brief      Turn off all process reports
+	 */
+	void silent();
+
+	/**
+	 * @brief      Set the interval at which the process monitoring reports
+	 * 
+	 * @param inter      The interval
+	 */
+	void set_report_interval(unsigned int inter);
+
+	/**
+	 * @brief      Set the parameters of the algorithms
+	 * 
+	 * @param in_param   The input parameters
+	 */
+	void set_lcg_parameter(const lcg_para &in_param);
+
+	/**
+	 * @brief      Run the minimizing process
+	 * 
+	 * @param m          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param x_size     Size of the solution vector
+	 * @param solver_id  Solver type
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void Minimize(lcg_float *m, const lcg_float *b, int x_size, 
+		lcg_solver_enum solver_id = LCG_CG, bool verbose = true, bool er_throw = false);
+
+	/**
+	 * @brief      Run the preconditioned minimizing process
+	 * 
+	 * @param m          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param x_size     Size of the solution vector
+	 * @param solver_id  Solver type
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void MinimizePreconditioned(lcg_float *m, const lcg_float *b, int x_size, 
+		lcg_solver_enum solver_id = LCG_PCG, bool verbose = true, bool er_throw = false);
+
+	/**
+	 * @brief      Run the constrained minimizing process
+	 * 
+	 * @param m          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param low        Lower bound of the solution vector
+	 * @param hig        Higher bound of the solution vector
+	 * @param x_size     Size of the solution vector
+	 * @param solver_id  Solver type
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void MinimizeConstrained(lcg_float *m, const lcg_float *b, const lcg_float* low, 
+		const lcg_float *hig, int x_size, lcg_solver_enum solver_id = LCG_PG, 
+		bool verbose = true, bool er_throw = false);
+};
+
+/**
+ * @brief      Complex linear conjugate gradient solver class
+ */
+class CLCG_Solver
+{
+protected:
+	clcg_para param_;      // algorithm parameters (see set_clcg_parameter())
+	unsigned int inter_;   // report interval (see set_report_interval())
+	bool silent_;          // reporting switch (see silent())
+
+public:
+	CLCG_Solver();
+	virtual ~CLCG_Solver(){}
+
+	/**
+	 * @brief       Static trampoline that forwards to the virtual AxProduct()
+	 * 
+	 * @param instance        Opaque pointer that is cast back to the CLCG_Solver object
+	 * @param[in] x           Pointer of the multiplier
+	 * @param[out] prod_Ax    Pointer of the product
+	 * @param x_size          Size of the array
+	 * @param layout          Layout of the kernel matrix. This is passed for the clcg_matvec() function
+	 * @param conjugate       Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
+	 */
+	static void _AxProduct(void *instance, const lcg_complex *x, lcg_complex *prod_Ax, 
+		const int x_size, lcg_matrix_e layout, clcg_complex_e conjugate)
+	{
+		CLCG_Solver *self = static_cast<CLCG_Solver*>(instance);
+		self->AxProduct(x, prod_Ax, x_size, layout, conjugate);
+	}
+
+	/**
+	 * @brief       Product of A*x. Pure virtual: must be supplied by the derived class
+	 * 
+	 * @param[in] x           Pointer of the multiplier
+	 * @param[out] prod_Ax    Pointer of the product
+	 * @param x_size          Size of the array
+	 * @param layout          Layout of the kernel matrix. This is passed for the clcg_matvec() function
+	 * @param conjugate       Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
+	 */
+	virtual void AxProduct(const lcg_complex *x, lcg_complex *prod_Ax, 
+		const int x_size, lcg_matrix_e layout, clcg_complex_e conjugate) = 0;
+
+	/**
+	 * @brief       Static trampoline that forwards to the virtual Progress()
+	 * 
+	 * @param instance    Opaque pointer that is cast back to the CLCG_Solver object
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param k           Current iteration count
+	 * @return int        Whatever Progress() returns (status of the process)
+	 */
+	static int _Progress(void* instance, const lcg_complex* m, const lcg_float converge, 
+		const clcg_para* param, const int n_size, const int k)
+	{
+		CLCG_Solver *self = static_cast<CLCG_Solver*>(instance);
+		return self->Progress(m, converge, param, n_size, k);
+	}
+
+	/**
+	 * @brief       Process monitoring hook. Virtual, so a derived class may override it
+	 * 
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param k           Current iteration count
+	 * @return int        Status of the process
+	 */
+	virtual int Progress(const lcg_complex* m, const lcg_float converge, 
+		const clcg_para* param, const int n_size, const int k);
+
+	/**
+	 * @brief      Switch off all process reports
+	 */
+	void silent();
+
+	/**
+	 * @brief      Choose how often the process monitoring function is run
+	 * 
+	 * @param inter      the interval
+	 */
+	void set_report_interval(unsigned int inter);
+
+	/**
+	 * @brief      Replace the parameters used by the algorithms
+	 * 
+	 * @param in_param   the input parameters
+	 */
+	void set_clcg_parameter(const clcg_para &in_param);
+
+	/**
+	 * @brief      Run the minimizing process
+	 * 
+	 * @param m          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param x_size     Size of the solution vector
+	 * @param solver_id  Solver type (defaults to CLCG_CGS)
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void Minimize(lcg_complex *m, const lcg_complex *b, int x_size, 
+		clcg_solver_enum solver_id = CLCG_CGS, bool verbose = true, 
+		bool er_throw = false);
+};
+
+#endif // _SOLVER_H
--- a/src/lib/solver_cuda.cu
+++ b/src/lib/solver_cuda.cu
@@ -0,0 +1,414 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "solver_cuda.h"
+
+#include "cmath"
+#include "ctime"
+#include "iostream"
+
+/**
+ * @brief      Build a solver holding the result of lcg_default_parameters(),
+ *             a report interval of 1, and the silent flag cleared.
+ */
+LCG_CUDA_Solver::LCG_CUDA_Solver()
+	: param_(lcg_default_parameters()), inter_(1), silent_(false)
+{
+}
+
+/**
+ * @brief      Default progress monitor: writes the iteration count and the
+ *             convergence value to std::clog.
+ *
+ * A line (prefixed with a carriage return) is written when k is a multiple of
+ * the report interval, or otherwise when converge has dropped to
+ * param->epsilon or below. m, n_size and nz_size are not used here.
+ *
+ * @return int   Always 0.
+ */
+int LCG_CUDA_Solver::Progress(const lcg_float* m, const lcg_float converge, 
+	const lcg_para* param, const int n_size, const int nz_size, const int k)
+{
+	// An interval of 0 disables the periodic report (and avoids k%0).
+	const bool at_report_step = (inter_ > 0) && (k%inter_ == 0);
+
+	// Short-circuit keeps param untouched whenever the periodic report fires.
+	if (at_report_step || converge <= param->epsilon)
+	{
+		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
+	}
+	return 0;
+}
+
+/**
+ * @brief      Raise the silent flag. The Minimize*() members of this class
+ *             check it and then run the solver without a progress callback.
+ */
+void LCG_CUDA_Solver::silent()
+{
+	silent_ = true;
+}
+
+/**
+ * @brief      Store the report interval read by Progress().
+ *
+ * @param inter   the interval; 0 turns off the periodic report of the
+ *                default Progress() implementation
+ */
+void LCG_CUDA_Solver::set_report_interval(unsigned int inter)
+{
+	inter_ = inter;
+}
+
+/**
+ * @brief      Copy the given parameters into the solver; they are handed to
+ *             the lcg_*_cuda routines by the Minimize*() members.
+ *
+ * @param in_param   the input parameters
+ */
+void LCG_CUDA_Solver::set_lcg_parameter(const lcg_para &in_param)
+{
+	param_ = in_param;
+}
+
+/**
+ * @brief      Run lcg_solver_cuda() with this object's A*x callback and report
+ *             the outcome.
+ *
+ * In silent mode no progress callback is installed, nothing is timed, and a
+ * negative status is passed to lcg_error_str() with the throw flag forced to
+ * true (er_throw is ignored on that path). Otherwise the call is timed with
+ * clock(), the solver name and elapsed time go to std::clog unless er_throw is
+ * set, and the status is passed to lcg_error_str() when verbose is set or the
+ * status is negative.
+ */
+void LCG_CUDA_Solver::Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, lcg_float *b, 
+	const int n_size, const int nz_size, lcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		const int status = lcg_solver_cuda(_AxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+		if (status < 0) lcg_error_str(status, true);
+		return;
+	}
+
+	// Solve with lcg. Note that default arguments cannot be omitted when the
+	// solver function is invoked through a function pointer.
+	const clock_t tick_begin = clock();
+	const int status = lcg_solver_cuda(_AxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+	const clock_t tick_end = clock();
+
+	// clock() ticks converted to milliseconds.
+	const lcg_float elapsed_ms = 1000*(tick_end-tick_begin)/(double)CLOCKS_PER_SEC;
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		if (solver_id == LCG_CG)
+			std::clog << "Solver: CG. Time cost: " << elapsed_ms << " ms" << std::endl;
+		else if (solver_id == LCG_CGS)
+			std::clog << "Solver: CGS. Time cost: " << elapsed_ms << " ms" << std::endl;
+		else
+			std::clog << "Solver: Unknown. Time cost: " << elapsed_ms << " ms" << std::endl;
+	}
+
+	// Verbose mode reports every status; otherwise only failures are reported.
+	if (verbose || status < 0) lcg_error_str(status, er_throw);
+}
+
+/**
+ * @brief      Run lcg_solver_preconditioned_cuda() with this object's A*x and
+ *             M^-1*x callbacks and report the outcome.
+ *
+ * In silent mode no progress callback is installed, nothing is timed, and a
+ * negative status is passed to lcg_error_str() with the throw flag forced to
+ * true (er_throw is ignored on that path). Otherwise the call is timed with
+ * clock(), the solver name and elapsed time go to std::clog unless er_throw is
+ * set, and the status is passed to lcg_error_str() when verbose is set or the
+ * status is negative.
+ */
+void LCG_CUDA_Solver::MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, lcg_float *b, 
+    const int n_size, const int nz_size, lcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		const int status = lcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+		if (status < 0) lcg_error_str(status, true);
+		return;
+	}
+
+	// Solve with lcg. Note that default arguments cannot be omitted when the
+	// solver function is invoked through a function pointer.
+	const clock_t tick_begin = clock();
+	const int status = lcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+	const clock_t tick_end = clock();
+
+	// clock() ticks converted to milliseconds.
+	const lcg_float elapsed_ms = 1000*(tick_end-tick_begin)/(double)CLOCKS_PER_SEC;
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		if (solver_id == LCG_PCG)
+			std::clog << "Solver: PCG. Time cost: " << elapsed_ms << " ms" << std::endl;
+		else
+			std::clog << "Solver: Unknown. Time cost: " << elapsed_ms << " ms" << std::endl;
+	}
+
+	// Verbose mode reports every status; otherwise only failures are reported.
+	if (verbose || status < 0) lcg_error_str(status, er_throw);
+}
+
+/**
+ * @brief      Run lcg_solver_constrained_cuda() with this object's A*x callback
+ *             and the given lower/upper bounds, then report the outcome.
+ *
+ * In silent mode no progress callback is installed, nothing is timed, and a
+ * negative status is passed to lcg_error_str() with the throw flag forced to
+ * true (er_throw is ignored on that path). Otherwise the call is timed with
+ * clock(), the solver name and elapsed time go to std::clog unless er_throw is
+ * set, and the status is passed to lcg_error_str() when verbose is set or the
+ * status is negative.
+ */
+void LCG_CUDA_Solver::MinimizeConstrained(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, const lcg_float *b, 
+    const lcg_float* low, const lcg_float *hig, const int n_size, const int nz_size, lcg_solver_enum solver_id, 
+    bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		const int status = lcg_solver_constrained_cuda(_AxProduct, nullptr, x, b, low, hig, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+		if (status < 0) lcg_error_str(status, true);
+		return;
+	}
+
+	// Solve with lcg. Note that default arguments cannot be omitted when the
+	// solver function is invoked through a function pointer.
+	const clock_t tick_begin = clock();
+	const int status = lcg_solver_constrained_cuda(_AxProduct, _Progress, x, b, low, hig, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+	const clock_t tick_end = clock();
+
+	// clock() ticks converted to milliseconds.
+	const lcg_float elapsed_ms = 1000*(tick_end-tick_begin)/(double)CLOCKS_PER_SEC;
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		if (solver_id == LCG_PG)
+			std::clog << "Solver: PG. Time cost: " << elapsed_ms << " ms" << std::endl;
+		else
+			std::clog << "Solver: Unknown. Time cost: " << elapsed_ms << " ms" << std::endl;
+	}
+
+	// Verbose mode reports every status; otherwise only failures are reported.
+	if (verbose || status < 0) lcg_error_str(status, er_throw);
+}
+
+
+/**
+ * @brief      Build a solver holding the result of clcg_default_parameters(),
+ *             a report interval of 1, and the silent flag cleared.
+ */
+CLCG_CUDAF_Solver::CLCG_CUDAF_Solver()
+	: param_(clcg_default_parameters()), inter_(1), silent_(false)
+{
+}
+
+/**
+ * @brief      Default progress monitor: writes the iteration count and the
+ *             convergence value to std::clog.
+ *
+ * A line (prefixed with a carriage return) is written when k is a multiple of
+ * the report interval, or otherwise when converge has dropped to
+ * param->epsilon or below. m, n_size and nz_size are not used here.
+ *
+ * @return int   Always 0.
+ */
+int CLCG_CUDAF_Solver::Progress(const cuComplex* m, const float converge, 
+	const clcg_para* param, const int n_size, const int nz_size, const int k)
+{
+	// An interval of 0 disables the periodic report (and avoids k%0).
+	const bool at_report_step = (inter_ > 0) && (k%inter_ == 0);
+
+	// Short-circuit keeps param untouched whenever the periodic report fires.
+	if (at_report_step || converge <= param->epsilon)
+	{
+		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
+	}
+	return 0;
+}
+
+/**
+ * @brief      Raise the silent flag. The Minimize*() members of this class
+ *             check it and then run the solver without a progress callback.
+ */
+void CLCG_CUDAF_Solver::silent()
+{
+	silent_ = true;
+}
+
+/**
+ * @brief      Store the report interval read by Progress().
+ *
+ * @param inter   the interval; 0 turns off the periodic report of the
+ *                default Progress() implementation
+ */
+void CLCG_CUDAF_Solver::set_report_interval(unsigned int inter)
+{
+	inter_ = inter;
+}
+
+/**
+ * @brief      Copy the given parameters into the solver; they are handed to
+ *             the clcg_*_cuda routines by the Minimize*() members.
+ *
+ * @param in_param   the input parameters
+ */
+void CLCG_CUDAF_Solver::set_clcg_parameter(const clcg_para &in_param)
+{
+	param_ = in_param;
+}
+
+/**
+ * @brief      Run clcg_solver_cuda() on single-precision complex data with this
+ *             object's A*x callback and report the outcome.
+ *
+ * In silent mode no progress callback is installed, nothing is timed, and a
+ * negative status is passed to lcg_error_str() with the throw flag forced to
+ * true (er_throw is ignored on that path). Otherwise the call is timed with
+ * clock(), the solver name and elapsed time go to std::clog unless er_throw is
+ * set, and the status is passed to lcg_error_str() when verbose is set or the
+ * status is negative.
+ *
+ * NOTE(review): the status comes from a clcg_* solver but is decoded with
+ * lcg_error_str() - confirm this is the intended decoder for complex solvers.
+ */
+void CLCG_CUDAF_Solver::Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuComplex *x, cuComplex *b, 
+	const int n_size, const int nz_size, clcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		const int status = clcg_solver_cuda(_AxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+		if (status < 0) lcg_error_str(status, true);
+		return;
+	}
+
+	// Solve with lcg. Note that default arguments cannot be omitted when the
+	// solver function is invoked through a function pointer.
+	const clock_t tick_begin = clock();
+	const int status = clcg_solver_cuda(_AxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+	const clock_t tick_end = clock();
+
+	// clock() ticks converted to milliseconds.
+	const float elapsed_ms = 1000*(tick_end-tick_begin)/(double)CLOCKS_PER_SEC;
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		if (solver_id == CLCG_BICG)
+			std::clog << "Solver: BI-CG. Time cost: " << elapsed_ms << " ms" << std::endl;
+		else if (solver_id == CLCG_BICG_SYM)
+			std::clog << "Solver: BI-CG (symmetrically accelerated). Time cost: " << elapsed_ms << " ms" << std::endl;
+		else
+			std::clog << "Solver: Unknown. Time cost: " << elapsed_ms << " ms" << std::endl;
+	}
+
+	// Verbose mode reports every status; otherwise only failures are reported.
+	if (verbose || status < 0) lcg_error_str(status, er_throw);
+}
+
+/**
+ * @brief      Run clcg_solver_preconditioned_cuda() on single-precision complex
+ *             data with this object's A*x and M^-1*x callbacks and report the
+ *             outcome.
+ *
+ * In silent mode no progress callback is installed, nothing is timed, and a
+ * negative status is passed to lcg_error_str() with the throw flag forced to
+ * true (er_throw is ignored on that path). Otherwise the call is timed with
+ * clock(), the solver name and elapsed time go to std::clog unless er_throw is
+ * set, and the status is passed to lcg_error_str() when verbose is set or the
+ * status is negative.
+ *
+ * NOTE(review): the status comes from a clcg_* solver but is decoded with
+ * lcg_error_str() - confirm this is the intended decoder for complex solvers.
+ */
+void CLCG_CUDAF_Solver::MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuComplex *x, cuComplex *b, 
+	const int n_size, const int nz_size, clcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		const int status = clcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+		if (status < 0) lcg_error_str(status, true);
+		return;
+	}
+
+	// Solve with lcg. Note that default arguments cannot be omitted when the
+	// solver function is invoked through a function pointer.
+	const clock_t tick_begin = clock();
+	const int status = clcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+	const clock_t tick_end = clock();
+
+	// clock() ticks converted to milliseconds.
+	const float elapsed_ms = 1000*(tick_end-tick_begin)/(double)CLOCKS_PER_SEC;
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		if (solver_id == CLCG_PCG)
+			std::clog << "Solver: PCG. Time cost: " << elapsed_ms << " ms" << std::endl;
+		else
+			std::clog << "Solver: Unknown. Time cost: " << elapsed_ms << " ms" << std::endl;
+	}
+
+	// Verbose mode reports every status; otherwise only failures are reported.
+	if (verbose || status < 0) lcg_error_str(status, er_throw);
+}
+
+
+/**
+ * @brief      Build a solver holding the result of clcg_default_parameters(),
+ *             a report interval of 1, and the silent flag cleared.
+ */
+CLCG_CUDA_Solver::CLCG_CUDA_Solver()
+	: param_(clcg_default_parameters()), inter_(1), silent_(false)
+{
+}
+
+/**
+ * @brief      Default progress monitor: writes the iteration count and the
+ *             convergence value to std::clog.
+ *
+ * A line (prefixed with a carriage return) is written when k is a multiple of
+ * the report interval, or otherwise when converge has dropped to
+ * param->epsilon or below. m, n_size and nz_size are not used here.
+ *
+ * @return int   Always 0.
+ */
+int CLCG_CUDA_Solver::Progress(const cuDoubleComplex* m, const lcg_float converge, 
+	const clcg_para* param, const int n_size, const int nz_size, const int k)
+{
+	// An interval of 0 disables the periodic report (and avoids k%0).
+	const bool at_report_step = (inter_ > 0) && (k%inter_ == 0);
+
+	// Short-circuit keeps param untouched whenever the periodic report fires.
+	if (at_report_step || converge <= param->epsilon)
+	{
+		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
+	}
+	return 0;
+}
+
+/**
+ * @brief      Raise the silent flag. The Minimize*() members of this class
+ *             check it and then run the solver without a progress callback.
+ */
+void CLCG_CUDA_Solver::silent()
+{
+	silent_ = true;
+}
+
+/**
+ * @brief      Store the report interval read by Progress().
+ *
+ * @param inter   the interval; 0 turns off the periodic report of the
+ *                default Progress() implementation
+ */
+void CLCG_CUDA_Solver::set_report_interval(unsigned int inter)
+{
+	inter_ = inter;
+}
+
+/**
+ * @brief      Copy the given parameters into the solver; they are handed to
+ *             the clcg_*_cuda routines by the Minimize*() members.
+ *
+ * @param in_param   the input parameters
+ */
+void CLCG_CUDA_Solver::set_clcg_parameter(const clcg_para &in_param)
+{
+	param_ = in_param;
+}
+
+/**
+ * @brief      Run clcg_solver_cuda() on double-precision complex data with this
+ *             object's A*x callback and report the outcome.
+ *
+ * In silent mode no progress callback is installed, nothing is timed, and a
+ * negative status is passed to lcg_error_str() with the throw flag forced to
+ * true (er_throw is ignored on that path). Otherwise the call is timed with
+ * clock(), the solver name and elapsed time go to std::clog unless er_throw is
+ * set, and the status is passed to lcg_error_str() when verbose is set or the
+ * status is negative.
+ *
+ * NOTE(review): the status comes from a clcg_* solver but is decoded with
+ * lcg_error_str() - confirm this is the intended decoder for complex solvers.
+ */
+void CLCG_CUDA_Solver::Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuDoubleComplex *x, cuDoubleComplex *b, 
+	const int n_size, const int nz_size, clcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		const int status = clcg_solver_cuda(_AxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+		if (status < 0) lcg_error_str(status, true);
+		return;
+	}
+
+	// Solve with lcg. Note that default arguments cannot be omitted when the
+	// solver function is invoked through a function pointer.
+	const clock_t tick_begin = clock();
+	const int status = clcg_solver_cuda(_AxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+	const clock_t tick_end = clock();
+
+	// clock() ticks converted to milliseconds.
+	const lcg_float elapsed_ms = 1000*(tick_end-tick_begin)/(double)CLOCKS_PER_SEC;
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		if (solver_id == CLCG_BICG)
+			std::clog << "Solver: BI-CG. Time cost: " << elapsed_ms << " ms" << std::endl;
+		else if (solver_id == CLCG_BICG_SYM)
+			std::clog << "Solver: BI-CG (symmetrically accelerated). Time cost: " << elapsed_ms << " ms" << std::endl;
+		else
+			std::clog << "Solver: Unknown. Time cost: " << elapsed_ms << " ms" << std::endl;
+	}
+
+	// Verbose mode reports every status; otherwise only failures are reported.
+	if (verbose || status < 0) lcg_error_str(status, er_throw);
+}
+
+/**
+ * @brief      Run clcg_solver_preconditioned_cuda() on double-precision complex
+ *             data with this object's A*x and M^-1*x callbacks and report the
+ *             outcome.
+ *
+ * In silent mode no progress callback is installed, nothing is timed, and a
+ * negative status is passed to lcg_error_str() with the throw flag forced to
+ * true (er_throw is ignored on that path). Otherwise the call is timed with
+ * clock(), the solver name and elapsed time go to std::clog unless er_throw is
+ * set, and the status is passed to lcg_error_str() when verbose is set or the
+ * status is negative.
+ *
+ * NOTE(review): the status comes from a clcg_* solver but is decoded with
+ * lcg_error_str() - confirm this is the intended decoder for complex solvers.
+ */
+void CLCG_CUDA_Solver::MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuDoubleComplex *x, cuDoubleComplex *b, 
+	const int n_size, const int nz_size, clcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		const int status = clcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, nullptr, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+		if (status < 0) lcg_error_str(status, true);
+		return;
+	}
+
+	// Solve with lcg. Note that default arguments cannot be omitted when the
+	// solver function is invoked through a function pointer.
+	const clock_t tick_begin = clock();
+	const int status = clcg_solver_preconditioned_cuda(_AxProduct, _MxProduct, _Progress, x, b, n_size, nz_size, &param_, this, cub_handle, cus_handle, solver_id);
+	const clock_t tick_end = clock();
+
+	// clock() ticks converted to milliseconds.
+	const lcg_float elapsed_ms = 1000*(tick_end-tick_begin)/(double)CLOCKS_PER_SEC;
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		if (solver_id == CLCG_PCG)
+			std::clog << "Solver: PCG. Time cost: " << elapsed_ms << " ms" << std::endl;
+		else
+			std::clog << "Solver: Unknown. Time cost: " << elapsed_ms << " ms" << std::endl;
+	}
+
+	// Verbose mode reports every status; otherwise only failures are reported.
+	if (verbose || status < 0) lcg_error_str(status, er_throw);
+}
--- a/src/lib/solver_cuda.h
+++ b/src/lib/solver_cuda.h
@@ -0,0 +1,545 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _SOLVER_CUDA_H
+#define _SOLVER_CUDA_H
+
+#include "lcg_cuda.h"
+#include "clcg_cuda.h"
+#include "clcg_cudaf.h"
+
+#ifdef LibLCG_CUDA
+
+/**
+ * @brief      Linear conjugate gradient solver class
+ */
+class LCG_CUDA_Solver
+{
+protected:
+	lcg_para param_;       // parameters handed to the lcg_*_cuda solver routines
+	unsigned int inter_;   // report interval read by Progress()
+	bool silent_;          // when true, Minimize*() run without a progress callback
+
+public:
+	LCG_CUDA_Solver();
+	virtual ~LCG_CUDA_Solver(){}
+
+	/**
+	 * @brief       Static trampoline that forwards to the virtual AxProduct()
+	 * 
+	 * @param instance        Opaque pointer that is cast back to the LCG_CUDA_Solver object
+	 * @param cub_handle      Handler of the CuBLAS library
+	 * @param cus_handle      Handler of the CuSparse library
+	 * @param[in] x           Dense-vector descriptor of the multiplier
+	 * @param[out] prod_Ax    Dense-vector descriptor of the product
+	 * @param n_size          Size of the solution
+	 * @param nz_size         Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
+	 */
+	static void _AxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+        cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
+	{
+		static_cast<LCG_CUDA_Solver*>(instance)->AxProduct(cub_handle, cus_handle, x, prod_Ax, n_size, nz_size);
+	}
+
+	/**
+	 * @brief       Product of A*x. Pure virtual: must be supplied by the derived class
+	 * 
+	 * @param cub_handle      Handler of the CuBLAS library
+	 * @param cus_handle      Handler of the CuSparse library
+	 * @param[in] x           Dense-vector descriptor of the multiplier
+	 * @param[out] prod_Ax    Dense-vector descriptor of the product
+	 * @param n_size          Size of the solution
+	 * @param nz_size         Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
+	 */
+	virtual void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+        cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size) = 0;
+
+	/**
+	 * @brief       Static trampoline that forwards to the virtual MxProduct()
+	 * 
+	 * @param instance        Opaque pointer that is cast back to the LCG_CUDA_Solver object
+	 * @param cub_handle      Handler of the CuBLAS library
+	 * @param cus_handle      Handler of the CuSparse library
+	 * @param[in] x           Dense-vector descriptor of the multiplier
+	 * @param[out] prod_Mx    Dense-vector descriptor of the product
+	 * @param n_size          Size of the solution
+	 * @param nz_size         Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
+	 */
+	static void _MxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+        cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, const int n_size, const int nz_size)
+	{
+		// Must dispatch to MxProduct (the preconditioner), not AxProduct:
+		// this callback is what MinimizePreconditioned() hands to the solver as M^-1*x.
+		static_cast<LCG_CUDA_Solver*>(instance)->MxProduct(cub_handle, cus_handle, x, prod_Mx, n_size, nz_size);
+	}
+
+	/**
+	 * @brief       Product of M^-1*x. Pure virtual: must be supplied by the derived class
+	 * 
+	 * @param cub_handle      Handler of the CuBLAS library
+	 * @param cus_handle      Handler of the CuSparse library
+	 * @param[in] x           Dense-vector descriptor of the multiplier
+	 * @param[out] prod_Mx    Dense-vector descriptor of the product
+	 * @param n_size          Size of the solution
+	 * @param nz_size         Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
+	 */
+	virtual void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+        cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, const int n_size, const int nz_size) = 0;
+
+	/**
+	 * @brief       Static trampoline that forwards to the virtual Progress()
+	 * 
+	 * @param instance    Opaque pointer that is cast back to the LCG_CUDA_Solver object
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
+	 * @param k           Current iteration count
+	 * @return int        Whatever Progress() returns (status of the process)
+	 */
+	static int _Progress(void* instance, const lcg_float* m, const lcg_float converge, 
+	    const lcg_para* param, const int n_size, const int nz_size, const int k)
+	{
+		return static_cast<LCG_CUDA_Solver*>(instance)->Progress(m, converge, param, n_size, nz_size, k);
+	}
+	
+	/**
+	 * @brief       Process monitoring hook. Virtual, so a derived class may override it
+	 * 
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
+	 * @param k           Current iteration count
+	 * @return int        Status of the process
+	 */
+	virtual int Progress(const lcg_float* m, const lcg_float converge, 
+	    const lcg_para* param, const int n_size, const int nz_size, const int k);
+
+	/**
+	 * @brief      Switch off all process reports
+	 */
+	void silent();
+
+	/**
+	 * @brief      Choose how often the process monitoring function is run
+	 * 
+	 * @param inter      the interval
+	 */
+	void set_report_interval(unsigned int inter);
+
+	/**
+	 * @brief      Replace the parameters used by the algorithms
+	 * 
+	 * @param in_param   the input parameters
+	 */
+	void set_lcg_parameter(const lcg_para &in_param);
+
+	/**
+	 * @brief      Run the minimizing process
+	 * 
+	 * @param cub_handle  Handler of the CuBLAS library
+	 * @param cus_handle  Handler of the CuSparse library
+	 * @param x          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param n_size     Size of the solution vector
+	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
+	 * @param solver_id  Solver type
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, lcg_float *b, 
+        const int n_size, const int nz_size, lcg_solver_enum solver_id = LCG_CG, bool verbose = true, bool er_throw = false);
+	
+	/**
+	 * @brief      Run the preconditioned minimizing process
+	 * 
+	 * NOTE(review): the default solver_id here is LCG_CG, while the report
+	 * printed by MinimizePreconditioned() only names LCG_PCG. Confirm whether
+	 * the default should be LCG_PCG; it is kept unchanged to preserve the interface.
+	 * 
+	 * @param cub_handle  Handler of the CuBLAS library
+	 * @param cus_handle  Handler of the CuSparse library
+	 * @param x          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param n_size     Size of the solution vector
+	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
+	 * @param solver_id  Solver type
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, lcg_float *b, 
+        const int n_size, const int nz_size, lcg_solver_enum solver_id = LCG_CG, bool verbose = true, bool er_throw = false);
+	
+	/**
+	 * @brief      Run the constrained minimizing process
+	 * 
+	 * @param cub_handle  Handler of the CuBLAS library
+	 * @param cus_handle  Handler of the CuSparse library
+	 * @param x          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param low        Lower bound of the solution vector
+	 * @param hig        Higher bound of the solution vector
+	 * @param n_size     Size of the solution vector
+	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm. It is passed for CUDA usages
+	 * @param solver_id  Solver type
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+    void MinimizeConstrained(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, lcg_float *x, const lcg_float *b, 
+        const lcg_float* low, const lcg_float *hig, const int n_size, const int nz_size, lcg_solver_enum solver_id = LCG_PG, 
+        bool verbose = true, bool er_throw = false);
+};
+
+
+/**
+ * @brief      Complex linear conjugate gradient solver class
+ */
+class CLCG_CUDAF_Solver
+{
+protected:
+	clcg_para param_;
+	unsigned int inter_;
+	bool silent_;
+
+public:
+	CLCG_CUDAF_Solver();
+	virtual ~CLCG_CUDAF_Solver(){}
+
+	/**
+	 * @brief       Interface of the virtual function of the product of A*x
+	 * 
+	 * @param instance   User data sent to identify the function address
+	 * @param cub_handle  Handler of the CuBLAS library
+	 * @param cus_handle  Handler of the CuSparse library
+	 * @param x[in]      Pointer of the multiplier
+	 * @param prod_Ax[out]     Pointer of the product
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 * @param oper_t      Cusparse operator. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 */
+	static void _AxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		// Recover the solver object from the opaque pointer and dispatch virtually.
+		CLCG_CUDAF_Solver *self = static_cast<CLCG_CUDAF_Solver*>(instance);
+		self->AxProduct(cub_handle, cus_handle, x, prod_Ax, n_size, nz_size, oper_t);
+	}
+
+	/**
+	 * @brief       Virtual function of the product of A*x
+	 * 
+	 * @param cub_handle  Handler of the CuBLAS library
+	 * @param cus_handle  Handler of the CuSparse library
+	 * @param x[in]      Pointer of the multiplier
+	 * @param prod_Ax[out]     Pointer of the product
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 * @param oper_t      Cusparse operator. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 */
+	virtual void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t) = 0;
+
+	/**
+	 * @brief       Interface of the virtual function of the product of M^-1*x
+	 * 
+	 * @param instance   User data sent to identify the function address
+	 * @param cub_handle  Handler of the CuBLAS library
+	 * @param cus_handle  Handler of the CuSparse library
+	 * @param x[in]      Pointer of the multiplier
+	 * @param prod_Mx[out]     Pointer of the product
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 * @param oper_t      Cusparse operator. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 */
+	static void _MxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		// Recover the solver object from the opaque pointer and dispatch virtually.
+		CLCG_CUDAF_Solver *self = static_cast<CLCG_CUDAF_Solver*>(instance);
+		self->MxProduct(cub_handle, cus_handle, x, prod_Mx, n_size, nz_size, oper_t);
+	}
+
+	/**
+	 * @brief       Virtual function of the product of M^-1*x
+	 * 
+	 * @param cub_handle  Handler of the CuBLAS library
+	 * @param cus_handle  Handler of the CuSparse library
+	 * @param x[in]      Pointer of the multiplier
+	 * @param prod_Mx[out]     Pointer of the product
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 * @param oper_t      Cusparse operator. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 */
+	virtual void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t) = 0;
+
+	/**
+	 * @brief       Interface of the virtual function of the process monitoring
+	 * 
+	 * @param instance    User data sent to identify the function address
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 * @param k           Current iteration times
+	 * @return int        Status of the process
+	 */
+	static int _Progress(void* instance, const cuComplex* m, const float converge, 
+	    const clcg_para* param, const int n_size, const int nz_size, const int k)
+	{
+		// Recover the solver object from the opaque pointer and dispatch virtually.
+		CLCG_CUDAF_Solver *self = static_cast<CLCG_CUDAF_Solver*>(instance);
+		return self->Progress(m, converge, param, n_size, nz_size, k);
+	}
+
+	/**
+	 * @brief       Virtual function of the process monitoring
+	 * 
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not need by the algorithm. It is passed for CUDA usages
+	 * @param k           Current iteration times
+	 * @return int        Status of the process
+	 */
+	virtual int Progress(const cuComplex* m, const float converge, 
+	    const clcg_para* param, const int n_size, const int nz_size, const int k);
+
+	/**
+	 * @brief      Do not report any processes
+	 */
+	void silent();
+
+	/**
+	 * @brief      Set the interval to run the process monitoring function
+	 * 
+	 * @param inter      the interval
+	 */
+	void set_report_interval(unsigned int inter);
+
+	/**
+	 * @brief      Set the parameters of the algorithms
+	 * 
+	 * @param in_param   the input parameters
+	 */
+	void set_clcg_parameter(const clcg_para &in_param);
+
+	/**
+	 * @brief      Run the minimizing process
+	 * 
+	 * @param cub_handle  Handle of the cuBLAS library
+	 * @param cus_handle  Handle of the cuSPARSE library
+	 * @param x          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param n_size     Size of the solution vector
+	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param solver_id  Solver type
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuComplex *x, cuComplex *b, 
+		const int n_size, const int nz_size, clcg_solver_enum solver_id = CLCG_BICG, bool verbose = true, bool er_throw = false);
+	
+	/**
+	 * @brief      Run the preconditioned minimizing process
+	 * 
+	 * @param cub_handle  Handler of the CuBLAS library
+	 * @param cus_handle  Handler of the CuSparse library
+	 * @param x          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param n_size     Size of the solution vector
+	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param solver_id  Solver type
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuComplex *x, cuComplex *b, 
+        const int n_size, const int nz_size, clcg_solver_enum solver_id = CLCG_PCG, bool verbose = true, bool er_throw = false);
+};
+
+
+/**
+ * @brief      Complex linear conjugate gradient solver class
+ */
+class CLCG_CUDA_Solver
+{
+protected:
+	clcg_para param_; // Algorithm parameters; presumably replaced by set_clcg_parameter() - implementation not shown here
+	unsigned int inter_; // Report interval; presumably replaced by set_report_interval() - implementation not shown here
+	bool silent_; // Reporting switch; presumably raised by silent() - implementation not shown here
+
+public:
+	CLCG_CUDA_Solver();
+	virtual ~CLCG_CUDA_Solver(){}
+
+	/**
+	 * @brief       Static interface that forwards to the virtual function of the product of A*x
+	 * 
+	 * @param instance   User data; it is cast back to the CLCG_CUDA_Solver object whose AxProduct() is called
+	 * @param cub_handle  Handle of the cuBLAS library
+	 * @param cus_handle  Handle of the cuSPARSE library
+	 * @param x[in]      Dense vector descriptor of the multiplier
+	 * @param prod_Ax[out]     Dense vector descriptor of the product
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param oper_t      cuSPARSE operation type. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 */
+	static void _AxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		return reinterpret_cast<CLCG_CUDA_Solver*>(instance)->AxProduct(cub_handle, cus_handle, x, prod_Ax, n_size, nz_size, oper_t);
+	}
+
+	/**
+	 * @brief       Pure virtual function of the product of A*x, to be implemented by the derived class
+	 * 
+	 * @param cub_handle  Handle of the cuBLAS library
+	 * @param cus_handle  Handle of the cuSPARSE library
+	 * @param x[in]      Dense vector descriptor of the multiplier
+	 * @param prod_Ax[out]     Dense vector descriptor of the product
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param oper_t      cuSPARSE operation type. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 */
+	virtual void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t) = 0;
+
+	/**
+	 * @brief       Static interface that forwards to the virtual function of the product of M^-1*x
+	 * 
+	 * @param instance   User data; it is cast back to the CLCG_CUDA_Solver object whose MxProduct() is called
+	 * @param cub_handle  Handle of the cuBLAS library
+	 * @param cus_handle  Handle of the cuSPARSE library
+	 * @param x[in]      Dense vector descriptor of the multiplier
+	 * @param prod_Mx[out]     Dense vector descriptor of the product
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param oper_t      cuSPARSE operation type. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 */
+	static void _MxProduct(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		return reinterpret_cast<CLCG_CUDA_Solver*>(instance)->MxProduct(cub_handle, cus_handle, x, prod_Mx, n_size, nz_size, oper_t);
+	}
+
+	/**
+	 * @brief       Pure virtual function of the product of M^-1*x, to be implemented by the derived class
+	 * 
+	 * @param cub_handle  Handle of the cuBLAS library
+	 * @param cus_handle  Handle of the cuSPARSE library
+	 * @param x[in]      Dense vector descriptor of the multiplier
+	 * @param prod_Mx[out]     Dense vector descriptor of the product
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param oper_t      cuSPARSE operation type. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 */
+	virtual void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Mx, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t) = 0;
+
+	/**
+	 * @brief       Static interface that forwards to the virtual function of the process monitoring
+	 * 
+	 * @param instance    User data; it is cast back to the CLCG_CUDA_Solver object whose Progress() is called
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param k           Current iteration count
+	 * @return int        Status of the process, as returned by Progress()
+	 */
+	static int _Progress(void* instance, const cuDoubleComplex* m, const lcg_float converge, 
+	    const clcg_para* param, const int n_size, const int nz_size, const int k)
+	{
+		return reinterpret_cast<CLCG_CUDA_Solver*>(instance)->Progress(m, converge, param, n_size, nz_size, k);
+	}
+
+	/**
+	 * @brief       Virtual function of the process monitoring; a derived class may override it
+	 * 
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param n_size      Size of the solution
+	 * @param nz_size     Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param k           Current iteration count
+	 * @return int        Status of the process
+	 */
+	virtual int Progress(const cuDoubleComplex* m, const lcg_float converge, 
+	    const clcg_para* param, const int n_size, const int nz_size, const int k);
+
+	/**
+	 * @brief      Do not report any progress
+	 */
+	void silent();
+
+	/**
+	 * @brief      Set the interval at which the process monitoring function reports
+	 * 
+	 * @param inter      the interval
+	 */
+	void set_report_interval(unsigned int inter);
+
+	/**
+	 * @brief      Set the parameters of the algorithms
+	 * 
+	 * @param in_param   the input parameters
+	 */
+	void set_clcg_parameter(const clcg_para &in_param);
+
+	/**
+	 * @brief      Run the minimizing process
+	 * 
+	 * @param cub_handle  Handle of the cuBLAS library
+	 * @param cus_handle  Handle of the cuSPARSE library
+	 * @param x          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param n_size     Size of the solution vector
+	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param solver_id  Solver type (defaults to CLCG_BICG)
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void Minimize(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuDoubleComplex *x, cuDoubleComplex *b, 
+		const int n_size, const int nz_size, clcg_solver_enum solver_id = CLCG_BICG, bool verbose = true, bool er_throw = false);
+	
+	/**
+	 * @brief      Run the preconditioned minimizing process
+	 * 
+	 * @param cub_handle  Handle of the cuBLAS library
+	 * @param cus_handle  Handle of the cuSPARSE library
+	 * @param x          Pointer of the solution vector
+	 * @param b          Pointer of the targeting vector
+	 * @param n_size     Size of the solution vector
+	 * @param nz_size    Non-zero size of the sparse kernel matrix. This parameter is not needed by the algorithm itself; it is passed through for CUDA usage
+	 * @param solver_id  Solver type (defaults to CLCG_PCG)
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void MinimizePreconditioned(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cuDoubleComplex *x, cuDoubleComplex *b, 
+        const int n_size, const int nz_size, clcg_solver_enum solver_id = CLCG_PCG, bool verbose = true, bool er_throw = false);
+};
+
+#endif // LibLCG_CUDA
+
+#endif // _SOLVER_CUDA_H
--- a/src/lib/solver_eigen.cpp
+++ b/src/lib/solver_eigen.cpp
@@ -0,0 +1,365 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "solver_eigen.h"
+
+#include "cmath"
+#include "ctime"
+#include "iostream"
+
+#include "config.h"
+#ifdef LibLCG_OPENMP
+#include "omp.h"
+#endif
+
+// Start from the library default parameters, report on every iteration, not silent.
+LCG_EIGEN_Solver::LCG_EIGEN_Solver()
+	: param_(lcg_default_parameters()),
+	  inter_(1),
+	  silent_(false)
+{}
+
+// Default monitor: print the iteration count and the convergence value when k
+// falls on the report interval, or when converge has reached param->epsilon.
+int LCG_EIGEN_Solver::Progress(const Eigen::VectorXd *m, const lcg_float converge, const lcg_para *param, 
+	const int k)
+{
+	const bool on_interval = (inter_ > 0) && (k%inter_ == 0);
+	const bool report = on_interval || (converge <= param->epsilon);
+	if (report)
+	{
+		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
+	}
+
+	// This default implementation always returns 0.
+	return 0;
+}
+
+// Make the Minimize* methods run without the monitoring callback or summary.
+void LCG_EIGEN_Solver::silent()
+{
+	silent_ = true;
+}
+
+// Store the interval used by Progress(); with 0 the interval test never fires.
+void LCG_EIGEN_Solver::set_report_interval(unsigned int inter)
+{
+	inter_ = inter;
+}
+
+// Replace the stored parameter set with a copy of in_param.
+void LCG_EIGEN_Solver::set_lcg_parameter(const lcg_para &in_param)
+{
+	param_ = in_param;
+}
+
+// Run lcg_solver_eigen() on m and b. Unless silent() was called the run is
+// timed and, when er_throw is false, a one-line summary goes to std::clog.
+void LCG_EIGEN_Solver::Minimize(Eigen::VectorXd &m, const Eigen::VectorXd &b, 
+	lcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		// No monitoring callback; a negative status is passed on with the throw flag set.
+		const int status = lcg_solver_eigen(_AxProduct, nullptr, m, b, &param_, this, solver_id);
+		if (status < 0) lcg_error_str(status, true);
+		return;
+	}
+
+	// The solver is invoked through function pointers, so default arguments cannot be omitted.
+#ifdef LibLCG_OPENMP
+	const double t_begin = omp_get_wtime();
+	const int status = lcg_solver_eigen(_AxProduct, _Progress, m, b, &param_, this, solver_id);
+	const double t_end = omp_get_wtime();
+
+	const lcg_float elapsed_ms = 1000*(t_end-t_begin);
+#else
+	const clock_t t_begin = clock();
+	const int status = lcg_solver_eigen(_AxProduct, _Progress, m, b, &param_, this, solver_id);
+	const clock_t t_end = clock();
+
+	const lcg_float elapsed_ms = 1000*(t_end-t_begin)/(double)CLOCKS_PER_SEC;
+#endif
+
+	if (!er_throw)
+	{
+		// Choose the summary prefix that belongs to the requested solver.
+		const char *head = "Solver: Unknown. Time cost: ";
+		switch (solver_id)
+		{
+			case LCG_CG:        head = "Solver: CG. Time cost: "; break;
+			case LCG_CGS:       head = "Solver: CGS. Time cost: "; break;
+			case LCG_BICGSTAB:  head = "Solver: BICGSTAB. Times cost: "; break;
+			case LCG_BICGSTAB2: head = "Solver: BICGSTAB2. Time cost: "; break;
+			default: break;
+		}
+
+		std::clog << std::endl;
+		std::clog << head << elapsed_ms << " ms" << std::endl;
+	}
+
+	// Verbose runs always pass the status to lcg_error_str(); quiet runs
+	// only do so when the status is negative.
+	if (verbose || status < 0)
+	{
+		lcg_error_str(status, er_throw);
+	}
+}
+
+// Run the preconditioned solver on m and b. Every build configuration calls
+// lcg_solver_preconditioned_eigen(), so _MxProduct is passed with or without OpenMP.
+void LCG_EIGEN_Solver::MinimizePreconditioned(Eigen::VectorXd &m, const Eigen::VectorXd &b, 
+	lcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		int ret = lcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, nullptr, m, b, &param_, this, solver_id);
+		if (ret < 0) lcg_error_str(ret, true);
+		return;
+	}
+
+	// Solve with lcg. The solver is called through function pointers, so default arguments cannot be omitted.
+#ifdef LibLCG_OPENMP
+	double start = omp_get_wtime();
+	int ret = lcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, _Progress, m, b, &param_, this, solver_id);
+	double end = omp_get_wtime();
+
+	lcg_float costime = 1000*(end-start);
+#else
+	clock_t start = clock();
+	int ret = lcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, _Progress, m, b, &param_, this, solver_id);
+	clock_t end = clock();
+
+	lcg_float costime = 1000*(end-start)/(double)CLOCKS_PER_SEC;
+#endif
+
+	if (!er_throw)
+	{
+		std::clog << std::endl;
+		switch (solver_id)
+		{
+			case LCG_PCG:
+				std::clog << "Solver: PCG. Time cost: " << costime << " ms" << std::endl;
+				break;
+			default:
+				std::clog << "Solver: Unknown. Time cost: " << costime << " ms" << std::endl;
+				break;
+		}	
+	}
+
+	if (verbose || ret < 0) lcg_error_str(ret, er_throw);
+}
+
+// Run lcg_solver_constrained_eigen() on m and B with the bounds low and hig. Unless silent()
+// was called the run is timed and, when er_throw is false, summarized on std::clog.
+void LCG_EIGEN_Solver::MinimizeConstrained(Eigen::VectorXd &m, const Eigen::VectorXd &B, const Eigen::VectorXd &low, 
+	const Eigen::VectorXd &hig, lcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		const int status = lcg_solver_constrained_eigen(_AxProduct, nullptr, m, B, low, hig, &param_, this, solver_id);
+		if (status < 0) lcg_error_str(status, true);
+		return;
+	}
+
+	// The solver is invoked through function pointers, so default arguments cannot be omitted.
+#ifdef LibLCG_OPENMP
+	const double t_begin = omp_get_wtime();
+	const int status = lcg_solver_constrained_eigen(_AxProduct, _Progress, m, B, low, hig, &param_, this, solver_id);
+	const double t_end = omp_get_wtime();
+
+	const lcg_float elapsed_ms = 1000*(t_end-t_begin);
+#else
+	const clock_t t_begin = clock();
+	const int status = lcg_solver_constrained_eigen(_AxProduct, _Progress, m, B, low, hig, &param_, this, solver_id);
+	const clock_t t_end = clock();
+
+	const lcg_float elapsed_ms = 1000*(t_end-t_begin)/(double)CLOCKS_PER_SEC;
+#endif
+
+	if (!er_throw)
+	{
+		const char *head = "Solver: Unknown. Time cost: ";
+		switch (solver_id)
+		{
+			case LCG_PG:  head = "Solver: PG-CG. Time cost: "; break;
+			case LCG_SPG: head = "Solver: SPG-CG. Time cost: "; break;
+			default: break;
+		}
+
+		std::clog << std::endl;
+		std::clog << head << elapsed_ms << " ms" << std::endl;
+	}
+
+	if (verbose || status < 0)
+	{
+		lcg_error_str(status, er_throw);
+	}
+}
+
+
+// Start from the library default complex parameters, report on every iteration, not silent.
+CLCG_EIGEN_Solver::CLCG_EIGEN_Solver()
+	: param_(clcg_default_parameters()),
+	  inter_(1),
+	  silent_(false)
+{}
+
+// Default monitor: print the iteration count and the convergence value when k
+// falls on the report interval, or when converge has reached param->epsilon.
+int CLCG_EIGEN_Solver::Progress(const Eigen::VectorXcd *m, const lcg_float converge, const clcg_para *param, 
+	const int k)
+{
+	const bool on_interval = (inter_ > 0) && ((k%inter_) == 0);
+	const bool report = on_interval || (converge <= param->epsilon);
+	if (report)
+	{
+		std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
+	}
+
+	// This default implementation always returns 0.
+	return 0;
+}
+
+// Make the Minimize* methods run without the monitoring callback or summary.
+void CLCG_EIGEN_Solver::silent()
+{
+	silent_ = true;
+}
+
+// Replace the stored parameter set with a copy of in_param.
+void CLCG_EIGEN_Solver::set_clcg_parameter(const clcg_para &in_param)
+{
+	param_ = in_param;
+}
+
+// Store the interval used by Progress(); with 0 the interval test never fires.
+void CLCG_EIGEN_Solver::set_report_interval(unsigned int inter)
+{
+	inter_ = inter;
+}
+
+// Run clcg_solver_eigen() on m and b with the requested solver. Unless
+// silent() was called the run is timed and, when er_throw is false, a
+// one-line summary is written to std::clog.
+void CLCG_EIGEN_Solver::Minimize(Eigen::VectorXcd &m, const Eigen::VectorXcd &b, 
+	clcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		// No monitoring callback; a negative status is passed on with the throw flag set.
+		const int status = clcg_solver_eigen(_AxProduct, nullptr, m, b, &param_, this, solver_id);
+		if (status < 0) clcg_error_str(status, true);
+		return;
+	}
+
+	// The solver is invoked through function pointers, so default arguments cannot be omitted.
+#ifdef LibLCG_OPENMP
+	// Time the run with the OpenMP timer.
+	const double t_begin = omp_get_wtime();
+	const int status = clcg_solver_eigen(_AxProduct, _Progress, m, b, &param_, this, solver_id);
+	const double t_end = omp_get_wtime();
+
+	const lcg_float elapsed_ms = 1000*(t_end-t_begin);
+#else
+	// Time the run with clock().
+	const clock_t t_begin = clock();
+	const int status = clcg_solver_eigen(_AxProduct, _Progress, m, b, &param_, this, solver_id);
+	const clock_t t_end = clock();
+
+	const lcg_float elapsed_ms = 1000*(t_end-t_begin)/(double)CLOCKS_PER_SEC;
+#endif
+
+	if (!er_throw)
+	{
+		// Choose the summary prefix that belongs to the requested solver.
+		const char *head = "Solver: Unknown. Time cost: ";
+		switch (solver_id)
+		{
+			case CLCG_BICG:     head = "Solver: BI-CG. Time cost: "; break;
+			case CLCG_BICG_SYM: head = "Solver: BI-CG (symmetrically accelerated). Time cost: "; break;
+			case CLCG_CGS:      head = "Solver: CGS. Time cost: "; break;
+			case CLCG_TFQMR:    head = "Solver: TFQMR. Times cost: "; break;
+			case CLCG_PCG:      head = "Solver: PCG. Times cost: "; break;
+			case CLCG_PBICG:    head = "Solver: PBICG. Times cost: "; break;
+			default: break;
+		}
+
+		// Progress() prints with '\r' and no newline, so end that line first.
+		std::clog << std::endl;
+		std::clog << head << elapsed_ms << " ms" << std::endl;
+	}
+
+	// Verbose runs always pass the status to clcg_error_str(); quiet runs
+	// only do so when the status is negative.
+	if (verbose || status < 0)
+	{
+		clcg_error_str(status, er_throw);
+	}
+}
+
+// Run clcg_solver_preconditioned_eigen() on m and b. Unless silent() was called
+// the run is timed and, when er_throw is false, summarized on std::clog.
+void CLCG_EIGEN_Solver::MinimizePreconditioned(Eigen::VectorXcd &m, const Eigen::VectorXcd &b, 
+	clcg_solver_enum solver_id, bool verbose, bool er_throw)
+{
+	if (silent_)
+	{
+		const int status = clcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, nullptr, m, b, &param_, this, solver_id);
+		if (status < 0) clcg_error_str(status, true);
+		return;
+	}
+
+	// The solver is invoked through function pointers, so default arguments cannot be omitted.
+#ifdef LibLCG_OPENMP
+	const double t_begin = omp_get_wtime();
+	const int status = clcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, _Progress, m, b, &param_, this, solver_id);
+	const double t_end = omp_get_wtime();
+
+	const lcg_float elapsed_ms = 1000*(t_end-t_begin);
+#else
+	const clock_t t_begin = clock();
+	const int status = clcg_solver_preconditioned_eigen(_AxProduct, _MxProduct, _Progress, m, b, &param_, this, solver_id);
+	const clock_t t_end = clock();
+
+	const lcg_float elapsed_ms = 1000*(t_end-t_begin)/(double)CLOCKS_PER_SEC;
+#endif
+
+	if (!er_throw)
+	{
+		const char *head = "Solver: Unknown. Time cost: ";
+		switch (solver_id)
+		{
+			case CLCG_PCG:   head = "Solver: PCG. Times cost: "; break;
+			case CLCG_PBICG: head = "Solver: PBICG. Times cost: "; break;
+			default: break;
+		}
+
+		std::clog << std::endl;
+		std::clog << head << elapsed_ms << " ms" << std::endl;
+	}
+
+	if (verbose || status < 0)
+	{
+		clcg_error_str(status, er_throw);
+	}
+}
--- a/src/lib/solver_eigen.h
+++ b/src/lib/solver_eigen.h
@@ -0,0 +1,308 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _SOLVER_EIGEN_H
+#define _SOLVER_EIGEN_H
+
+#include "lcg_eigen.h"
+#include "clcg_eigen.h"
+
+/**
+ * @brief      Linear conjugate gradient solver class
+ */
+class LCG_EIGEN_Solver
+{
+protected:
+	lcg_para param_; // Algorithm parameters handed to the solver routines
+	unsigned int inter_; // Report interval tested by Progress()
+	bool silent_; // When true the Minimize* methods pass no monitoring callback
+
+public:
+	LCG_EIGEN_Solver();
+	virtual ~LCG_EIGEN_Solver(){}
+
+	/**
+	 * @brief       Static interface that forwards to the virtual function of the product of A*x
+	 * 
+	 * @param instance   User data; it is cast back to the LCG_EIGEN_Solver object whose AxProduct() is called
+	 * @param x[in]      The multiplier vector
+	 * @param prod_Ax[out]     The product vector
+	 */
+	static void _AxProduct(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Ax)
+	{
+		return reinterpret_cast<LCG_EIGEN_Solver*>(instance)->AxProduct(x, prod_Ax);
+	}
+
+	/**
+	 * @brief       Pure virtual function of the product of A*x, to be implemented by the derived class
+	 * 
+	 * @param x[in]     The multiplier vector
+	 * @param prod_Ax[out]    The product vector
+	 */
+	virtual void AxProduct(const Eigen::VectorXd &x, Eigen::VectorXd &prod_Ax) = 0;
+
+	/**
+	 * @brief       Static interface that forwards to the virtual function of the product of M^-1*x
+	 * 
+	 * @param instance   User data; it is cast back to the LCG_EIGEN_Solver object whose MxProduct() is called
+	 * @param x[in]      The multiplier vector
+	 * @param prod_Mx[out]     The product vector
+	 */
+	static void _MxProduct(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Mx)
+	{
+		return reinterpret_cast<LCG_EIGEN_Solver*>(instance)->MxProduct(x, prod_Mx);
+	}
+
+	/**
+	 * @brief       Pure virtual function of the product of M^-1*x, to be implemented by the derived class
+	 * 
+	 * @param x[in]     The multiplier vector
+	 * @param prod_Mx[out]    The product vector
+	 */
+	virtual void MxProduct(const Eigen::VectorXd &x, Eigen::VectorXd &prod_Mx) = 0;
+
+	/**
+	 * @brief       Static interface that forwards to the virtual function of the process monitoring
+	 * 
+	 * @param instance    User data; it is cast back to the LCG_EIGEN_Solver object whose Progress() is called
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param k           Current iteration count
+	 * @return int        Status of the process, as returned by Progress()
+	 */
+	static int _Progress(void* instance, const Eigen::VectorXd *m, const lcg_float converge, 
+		const lcg_para *param, const int k)
+	{
+		return reinterpret_cast<LCG_EIGEN_Solver*>(instance)->Progress(m, converge, param, k);
+	}
+
+	/**
+	 * @brief       Virtual function of the process monitoring; a derived class may override it
+	 * 
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param k           Current iteration count
+	 * @return int        Status of the process
+	 */
+	virtual int Progress(const Eigen::VectorXd *m, const lcg_float converge, const lcg_para *param, 
+		const int k);
+
+	/**
+	 * @brief      Do not report any progress
+	 */
+	void silent();
+
+	/**
+	 * @brief      Set the interval at which the process monitoring function reports
+	 * 
+	 * @param inter      the interval
+	 */
+	void set_report_interval(unsigned int inter);
+
+	/**
+	 * @brief      Set the parameters of the algorithms
+	 * 
+	 * @param in_param   the input parameters
+	 */
+	void set_lcg_parameter(const lcg_para &in_param);
+
+	/**
+	 * @brief      Run the minimizing process
+	 * 
+	 * @param m          The solution vector
+	 * @param b          The targeting vector
+	 * @param solver_id  Solver type (defaults to LCG_CG)
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void Minimize(Eigen::VectorXd &m, const Eigen::VectorXd &b, lcg_solver_enum solver_id = LCG_CG, 
+		bool verbose = true, bool er_throw = false);
+	
+	/**
+	 * @brief      Run the preconditioned minimizing process
+	 * 
+	 * @param m          The solution vector
+	 * @param b          The targeting vector
+	 * @param solver_id  Solver type (defaults to LCG_PCG)
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void MinimizePreconditioned(Eigen::VectorXd &m, const Eigen::VectorXd &b, lcg_solver_enum solver_id = LCG_PCG, 
+		bool verbose = true, bool er_throw = false);
+
+	/**
+	 * @brief      Run the constrained minimizing process
+	 * 
+	 * @param m          The solution vector
+	 * @param B          The targeting vector
+	 * @param low        Lower bound of the solution vector
+	 * @param hig        Higher bound of the solution vector
+	 * @param solver_id  Solver type (defaults to LCG_PG)
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void MinimizeConstrained(Eigen::VectorXd &m, const Eigen::VectorXd &B, const Eigen::VectorXd &low, 
+		const Eigen::VectorXd &hig, lcg_solver_enum solver_id = LCG_PG, bool verbose = true, 
+		bool er_throw = false);
+};
+
+/**
+ * @brief      Complex linear conjugate gradient solver class
+ */
+class CLCG_EIGEN_Solver
+{
+protected:
+	clcg_para param_; // Algorithm parameters handed to the solver routines
+	unsigned int inter_; // Report interval tested by Progress()
+	bool silent_; // When true the Minimize* methods pass no monitoring callback
+
+public:
+	CLCG_EIGEN_Solver();
+	virtual ~CLCG_EIGEN_Solver(){}
+
+	/**
+	 * @brief       Static interface that forwards to the virtual function of the product of A*x
+	 * 
+	 * @param instance   User data; it is cast back to the CLCG_EIGEN_Solver object whose AxProduct() is called
+	 * @param x[in]      The multiplier vector
+	 * @param prod_Ax[out]     The product vector
+	 * @param layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
+	 * @param conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
+	 */
+	static void _AxProduct(void* instance, const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Ax, 
+		lcg_matrix_e layout, clcg_complex_e conjugate)
+	{
+		return reinterpret_cast<CLCG_EIGEN_Solver*>(instance)->AxProduct(x, prod_Ax, layout, conjugate);
+	}
+
+	/**
+	 * @brief       Pure virtual function of the product of A*x, to be implemented by the derived class
+	 * 
+	 * @param x[in]      The multiplier vector
+	 * @param prod_Ax[out]     The product vector
+	 * @param layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
+	 * @param conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
+	 */
+	virtual void AxProduct(const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Ax, 
+		lcg_matrix_e layout, clcg_complex_e conjugate) = 0;
+
+	/**
+	 * @brief       Static interface that forwards to the virtual function of the product of M^-1*x
+	 * 
+	 * @param instance   User data; it is cast back to the CLCG_EIGEN_Solver object whose MxProduct() is called
+	 * @param x[in]      The multiplier vector
+	 * @param prod_Mx[out]     The product vector
+	 * @param layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
+	 * @param conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
+	 */
+    static void _MxProduct(void* instance, const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Mx, 
+        lcg_matrix_e layout, clcg_complex_e conjugate)
+    {
+        return reinterpret_cast<CLCG_EIGEN_Solver*>(instance)->MxProduct(x, prod_Mx, layout, conjugate);
+    }
+
+	/**
+	 * @brief       Pure virtual function of the product of M^-1*x, to be implemented by the derived class
+	 * 
+	 * @param x[in]      The multiplier vector
+	 * @param prod_Mx[out]     The product vector
+	 * @param layout     Layout of the kernel matrix. This is passed for the clcg_matvec() function
+	 * @param conjugate  Whether to use the conjugate of the kernel matrix. This is passed for the clcg_matvec() function
+	 */
+    virtual void MxProduct(const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Mx, 
+        lcg_matrix_e layout, clcg_complex_e conjugate) = 0;
+
+	/**
+	 * @brief       Static interface that forwards to the virtual function of the process monitoring
+	 * 
+	 * @param instance    User data; it is cast back to the CLCG_EIGEN_Solver object whose Progress() is called
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param k           Current iteration count
+	 * @return int        Status of the process, as returned by Progress()
+	 */
+	static int _Progress(void* instance, const Eigen::VectorXcd *m, const lcg_float converge, 
+		const clcg_para *param, const int k)
+	{
+		return reinterpret_cast<CLCG_EIGEN_Solver*>(instance)->Progress(m, converge, param, k);
+	}
+
+	/**
+	 * @brief       Virtual function of the process monitoring; a derived class may override it
+	 * 
+	 * @param m           Pointer of the current solution
+	 * @param converge    Current value of the convergence
+	 * @param param       Pointer of the parameters used in the algorithms
+	 * @param k           Current iteration count
+	 * @return int        Status of the process
+	 */
+	virtual int Progress(const Eigen::VectorXcd *m, const lcg_float converge, const clcg_para *param, 
+		const int k);
+
+	/**
+	 * @brief      Do not report any progress
+	 */
+	void silent();
+
+	/**
+	 * @brief      Set the interval at which the process monitoring function reports
+	 * 
+	 * @param inter      the interval
+	 */
+	void set_report_interval(unsigned int inter);
+
+	/**
+	 * @brief      Set the parameters of the algorithms
+	 * 
+	 * @param in_param   the input parameters
+	 */
+	void set_clcg_parameter(const clcg_para &in_param);
+	
+	/**
+	 * @brief      Run the minimizing process
+	 * 
+	 * @param m          The solution vector
+	 * @param b          The targeting vector
+	 * @param solver_id  Solver type (defaults to CLCG_CGS)
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+	void Minimize(Eigen::VectorXcd &m, const Eigen::VectorXcd &b, clcg_solver_enum solver_id = CLCG_CGS, 
+		bool verbose = true, bool er_throw = false);
+
+	/**
+	 * @brief      Run the preconditioned minimizing process
+	 * 
+	 * @param m          The solution vector
+	 * @param b          The targeting vector
+	 * @param solver_id  Solver type (defaults to CLCG_PBICG)
+	 * @param verbose    Report more information of the full process
+	 * @param er_throw   Instead of showing error messages on screen, throw them out using std::exception
+	 */
+    void MinimizePreconditioned(Eigen::VectorXcd &m, const Eigen::VectorXcd &b, clcg_solver_enum solver_id = CLCG_PBICG, 
+        bool verbose = true, bool er_throw = false);
+};
+
+#endif // _SOLVER_EIGEN_H
--- a/src/lib/util.cpp
+++ b/src/lib/util.cpp
@@ -0,0 +1,253 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "iostream"
+#include "exception"
+#include "stdexcept"
+
+#include "util.h"
+
+#if defined _WINDOWS || __WIN32__
+#include "windows.h"
+#endif
+
+// Hand out a copy of the default parameter set (defparam).
+lcg_para lcg_default_parameters()
+{
+	return defparam;
+}
+
+// Translate a solver name into its lcg_solver_enum value.
+// Throws std::invalid_argument when the name matches none of the enumerators.
+lcg_solver_enum lcg_select_solver(std::string slr_char)
+{
+	if (slr_char == "LCG_CG") return LCG_CG;
+	if (slr_char == "LCG_PCG") return LCG_PCG;
+	if (slr_char == "LCG_CGS") return LCG_CGS;
+	if (slr_char == "LCG_BICGSTAB") return LCG_BICGSTAB;
+	if (slr_char == "LCG_BICGSTAB2") return LCG_BICGSTAB2;
+	if (slr_char == "LCG_PG") return LCG_PG;
+	if (slr_char == "LCG_SPG") return LCG_SPG;
+
+	throw std::invalid_argument("Invalid solver type.");
+}
+
+// Explain a return code of the lcg solvers.
+//
+// er_index  Return code (non-negative: success/status, negative: error).
+// er_throw  false: write one colored line to std::cerr ("Success! ..." for a 
+//           non-negative code, "Fail! ..." for a negative one).
+//           true:  throw std::runtime_error for a negative code; a non-negative 
+//           code is written to std::cerr as one plain line.
+void lcg_error_str(int er_index, bool er_throw)
+{
+	const bool succeeded = (er_index >= 0);
+
+	std::string err_str;
+	switch (er_index)
+	{
+		case LCG_SUCCESS:
+			err_str = "Iteration reached convergence."; break;
+		case LCG_STOP:
+			err_str = "Iteration is stopped by the progress evaluation function."; break;
+		case LCG_ALREADY_OPTIMIZIED:
+			err_str = "The variables are already optimized."; break;
+		case LCG_UNKNOWN_ERROR:
+			err_str = "Unknown error."; break;
+		case LCG_INVILAD_VARIABLE_SIZE:
+			err_str = "The size of the variables is negative."; break;
+		case LCG_INVILAD_MAX_ITERATIONS:
+			err_str = "The maximal iteration times can't be negative."; break;
+		case LCG_INVILAD_EPSILON:
+			err_str = "The epsilon is not in the range (0, 1)."; break;
+		case LCG_INVILAD_RESTART_EPSILON:
+			err_str = "The restart threshold can't be negative."; break;
+		case LCG_REACHED_MAX_ITERATIONS:
+			err_str = "The maximal iteration has been reached."; break;
+		case LCG_NULL_PRECONDITION_MATRIX:
+			err_str = "The precondition matrix can't be null."; break;
+		case LCG_NAN_VALUE:
+			err_str = "The model values are NaN."; break;
+		case LCG_INVALID_POINTER:
+			err_str = "Invalid pointer."; break;
+		case LCG_INVALID_LAMBDA:
+			err_str = "Invalid value for lambda."; break;
+		case LCG_INVALID_SIGMA:
+			err_str = "Invalid value for sigma."; break;
+		case LCG_INVALID_BETA:
+			err_str = "Invalid value for beta."; break;
+		case LCG_INVALID_MAXIM:
+			err_str = "Invalid value for maxi_m."; break;
+		case LCG_SIZE_NOT_MATCH:
+			err_str = "The sizes of solution and target do not match."; break;
+		default:
+			err_str = "Unknown error."; break;
+	}
+
+	if (er_throw)
+	{
+		// Nothing is printed before an error is thrown.
+		if (!succeeded) throw std::runtime_error(err_str);
+
+		// Status codes are still reported in throw mode. Terminate the line so
+		// that later output does not get glued to the message.
+		std::cerr << err_str << std::endl;
+		return;
+	}
+
+#if defined _WINDOWS || __WIN32__
+	SetConsoleTextAttribute(GetStdHandle(STD_ERROR_HANDLE), 
+		FOREGROUND_INTENSITY | (succeeded ? FOREGROUND_GREEN : FOREGROUND_RED));
+	std::cerr << (succeeded ? "Success! " : "Fail! ") << err_str;
+	// 7 == FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE (plain console text)
+	SetConsoleTextAttribute(GetStdHandle(STD_ERROR_HANDLE), 7);
+	std::cerr << std::endl;
+#else
+	// ANSI escapes: bold green / bold red prefix, then reset the attributes
+	std::cerr << (succeeded ? "\033[1m\033[32mSuccess! " : "\033[1m\033[31mFail! ") 
+		<< err_str << "\033[0m" << std::endl;
+#endif
+
+	return;
+}
+
+
+// Hand out a copy of the default complex-solver parameter set (defparam2).
+clcg_para clcg_default_parameters()
+{
+	return defparam2;
+}
+
+// Translate a solver name into its clcg_solver_enum value.
+// Every enumerator of clcg_solver_enum can be selected by its own name.
+// Throws std::invalid_argument when the name matches none of them.
+clcg_solver_enum clcg_select_solver(std::string slr_char)
+{
+	clcg_solver_enum slr_id;
+	if (slr_char == "CLCG_BICG") slr_id = CLCG_BICG;
+	else if (slr_char == "CLCG_BICG_SYM") slr_id = CLCG_BICG_SYM;
+	else if (slr_char == "CLCG_CGS") slr_id = CLCG_CGS;
+	else if (slr_char == "CLCG_BICGSTAB") slr_id = CLCG_BICGSTAB;
+	else if (slr_char == "CLCG_TFQMR") slr_id = CLCG_TFQMR;
+	else if (slr_char == "CLCG_PCG") slr_id = CLCG_PCG;
+	else if (slr_char == "CLCG_PBICG") slr_id = CLCG_PBICG;
+	else throw std::invalid_argument("Invalid solver type.");
+	return slr_id;
+}
+
+// Explain a return code of the clcg solvers.
+//
+// er_index  Return code (non-negative: success/status, negative: error).
+// er_throw  false: write one colored line to std::cerr ("Success! ..." for a 
+//           non-negative code, "Fail! ..." for a negative one).
+//           true:  throw std::runtime_error for a negative code; a non-negative 
+//           code is written to std::cerr as one plain line.
+void clcg_error_str(int er_index, bool er_throw)
+{
+	const bool succeeded = (er_index >= 0);
+
+	std::string err_str;
+	switch (er_index)
+	{
+		case CLCG_SUCCESS:
+			err_str = "Iteration reached convergence."; break;
+		case CLCG_STOP:
+			err_str = "Iteration is stopped by the progress evaluation function."; break;
+		case CLCG_ALREADY_OPTIMIZIED:
+			err_str = "The variables are already optimized."; break;
+		case CLCG_UNKNOWN_ERROR:
+			err_str = "Unknown error."; break;
+		case CLCG_INVILAD_VARIABLE_SIZE:
+			err_str = "The size of the variables is negative."; break;
+		case CLCG_INVILAD_MAX_ITERATIONS:
+			err_str = "The maximal iteration times is negative."; break;
+		case CLCG_INVILAD_EPSILON:
+			err_str = "The epsilon is not in the range (0, 1)."; break;
+		case CLCG_REACHED_MAX_ITERATIONS:
+			err_str = "The maximal iteration has been reached."; break;
+		case CLCG_NAN_VALUE:
+			err_str = "The model values are NaN."; break;
+		case CLCG_INVALID_POINTER:
+			err_str = "Invalid pointer."; break;
+		case CLCG_SIZE_NOT_MATCH:
+			err_str = "The sizes of the solution and target do not match."; break;
+		case CLCG_UNKNOWN_SOLVER:
+			err_str = "Unknown solver."; break;
+		default:
+			err_str = "Unknown error."; break;
+	}
+
+	if (er_throw)
+	{
+		// Nothing is printed before an error is thrown.
+		if (!succeeded) throw std::runtime_error(err_str);
+
+		// Status codes are still reported in throw mode. Terminate the line so
+		// that later output does not get glued to the message.
+		std::cerr << err_str << std::endl;
+		return;
+	}
+
+#if defined _WINDOWS || __WIN32__
+	SetConsoleTextAttribute(GetStdHandle(STD_ERROR_HANDLE), 
+		FOREGROUND_INTENSITY | (succeeded ? FOREGROUND_GREEN : FOREGROUND_RED));
+	std::cerr << (succeeded ? "Success! " : "Fail! ") << err_str;
+	// 7 == FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE (plain console text)
+	SetConsoleTextAttribute(GetStdHandle(STD_ERROR_HANDLE), 7);
+	std::cerr << std::endl;
+#else
+	// ANSI escapes: bold green / bold red prefix, then reset the attributes
+	std::cerr << (succeeded ? "\033[1m\033[32mSuccess! " : "\033[1m\033[31mFail! ") 
+		<< err_str << "\033[0m" << std::endl;
+#endif
+
+	return;
+}
--- a/src/lib/util.h
+++ b/src/lib/util.h
@@ -0,0 +1,308 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#ifndef _LCG_UTIL_H
+#define _LCG_UTIL_H
+
+#include "string"
+#include "algebra.h"
+
+/**
+ * @brief      Types of method that could be recognized by the lcg_solver() function.
+ * 
+ * lcg_select_solver() maps the enumerator names (as strings) to these values.
+ */
+enum lcg_solver_enum
+{
+	/**
+	 * Conjugate gradient method.
+	 */
+	LCG_CG,
+	/**
+	 * Preconditioned conjugate gradient method.
+	 */
+	LCG_PCG,
+	/**
+	 * Conjugate gradient squared method.
+	 */
+	LCG_CGS,
+	/**
+	 * Biconjugate gradient stabilized method.
+	 */
+	LCG_BICGSTAB,
+	/**
+	 * Biconjugate gradient stabilized method with restart 
+	 * (see lcg_para::restart_epsilon).
+	 */
+	LCG_BICGSTAB2,
+	/**
+	 * Conjugate gradient method with projected gradient for inequality constraints.
+	 * This algorithm comes without the non-monotonic line search for the step length.
+	 */
+	LCG_PG,
+	/**
+	 * Conjugate gradient method with spectral projected gradient for inequality constraints.
+	 * This algorithm comes with the non-monotonic line search for the step length.
+	 */
+	LCG_SPG,
+};
+
+/**
+ * @brief      Return values of the lcg_solver() function.
+ * 
+ * Non-negative values report a status, negative values report an error. 
+ * The error codes count upwards from LCG_UNKNOWN_ERROR (-1024). 
+ * lcg_error_str() turns a value into a readable message.
+ */
+enum lcg_return_enum
+{
+	LCG_SUCCESS = 0, ///< The solver function terminated successfully.
+	LCG_CONVERGENCE = 0, ///< The iteration reached convergence (same value as LCG_SUCCESS).
+	LCG_STOP, ///< The iteration is stopped by the monitoring function.
+	LCG_ALREADY_OPTIMIZIED, ///< The initial solution is already optimized.
+	// A negative number means an error
+	LCG_UNKNOWN_ERROR = -1024, ///< Unknown error.
+	LCG_INVILAD_VARIABLE_SIZE, ///< The variable size is negative
+	LCG_INVILAD_MAX_ITERATIONS, ///< The maximal iteration times is negative.
+	LCG_INVILAD_EPSILON, ///< Invalid epsilon (reported by lcg_error_str() as not in the range (0, 1)).
+	LCG_INVILAD_RESTART_EPSILON, ///< The restart epsilon is negative.
+	LCG_REACHED_MAX_ITERATIONS, ///< Iteration reached maximal limit.
+	LCG_NULL_PRECONDITION_MATRIX, ///< Null precondition matrix.
+	LCG_NAN_VALUE, ///< Nan value.
+	LCG_INVALID_POINTER, ///< Invalid pointer.
+	LCG_INVALID_LAMBDA, ///< Invalid range for lambda.
+	LCG_INVALID_SIGMA, ///< Invalid range for sigma.
+	LCG_INVALID_BETA, ///< Invalid range for beta.
+	LCG_INVALID_MAXIM, ///< Invalid range for maxi_m.
+	LCG_SIZE_NOT_MATCH, ///< Sizes of m and B do not match
+};
+
+/**
+ * @brief      Parameters of the conjugate gradient methods.
+ * 
+ * The default values quoted below are the ones stored in defparam.
+ */
+struct lcg_para
+{
+	/**
+	 * Maximal iteration times. The process will continue till the convergence is met
+	 * if this option is set to zero (default).
+	*/
+	int max_iterations;
+
+	/**
+	 * Epsilon for convergence test.
+	 * This parameter determines the accuracy with which the solution is to be 
+	 * found. A minimization terminates when ||g||/max(||g0||, 1.0) <= epsilon or 
+	 * sqrt(||g||)/N <= epsilon for the lcg_solver() function, where ||.|| denotes 
+	 * the Euclidean (L2) norm. The default value of epsilon is 1e-8.
+	*/
+	lcg_float epsilon;
+
+	/**
+	 * Whether to use absolute mean differences (AMD) between |Ax - B| to evaluate the process. 
+	 * The default value is 0 (false) which means the gradient based evaluating method is used. 
+	 * The AMD based method will be used if this variable is set to a true (non-zero) value. 
+	 * This parameter is only applied to the non-constrained methods.
+	 */
+	int abs_diff;
+
+	/**
+	 * Restart epsilon for the LCG_BICGSTAB2 algorithm. The default value is 1e-6
+	 */
+	lcg_float restart_epsilon;
+
+	/**
+	 * Initial step length for the projected gradient method. The default is 1.0
+	 */
+	lcg_float step;
+
+	/**
+	 * Multiplier for updating solutions with the spectral projected gradient method. The range of
+	 * this variable is (0, 1). The default is given as 0.95
+	 */
+	lcg_float sigma;
+
+	/**
+	 * Descending ratio for conducting the non-monotonic line search. The range of
+	 * this variable is (0, 1). The default is given as 0.9
+	 */
+	lcg_float beta;
+
+	/**
+	 * The maximal record times of the objective values for the SPG method. The method uses the 
+	 * objective values from the most recent maxi_m times to perform the non-monotonic line search.
+	 * The default value is 10.
+	 */
+	int maxi_m;
+};
+
+/**
+ * Default parameters for the conjugate gradient methods, listed in the field order of lcg_para:
+ * max_iterations, epsilon, abs_diff, restart_epsilon, step, sigma, beta, maxi_m.
+ * Declared static in this header, so every translation unit that includes it gets its own copy.
+ */
+static const lcg_para defparam = {0, 1e-8, 0, 1e-6, 1.0, 0.95, 0.9, 10};
+
+/**
+ * @brief      Return a lcg_para type instance with default values.
+ * 
+ * Users can use this function to get default parameters' value for the conjugate gradient methods.
+ * 
+ * @return     A lcg_para type instance.
+ */
+lcg_para lcg_default_parameters();
+
+/**
+ * @brief      Select a type of solver according to the name
+ *
+ * @param[in]  slr_char  Name of the solver
+ *
+ * @return     The lcg solver enum.
+ */
+lcg_solver_enum lcg_select_solver(std::string slr_char);
+
+/**
+ * @brief      Display or throw out a string explanation for the lcg_solver() function's return values.
+ * 
+ * The function returns nothing. The explanation is either written to the 
+ * standard error stream or carried by the thrown exception.
+ *
+ * @param[in]  er_index  The error index returned by the lcg_solver() function.
+ * @param[in]  er_throw  Throw out the explanation instead of displaying it. The default is false.
+ */
+void lcg_error_str(int er_index, bool er_throw = false);
+
+
+/**
+ * @brief      Types of method that could be recognized by the clcg_solver() function.
+ */
+enum clcg_solver_enum
+{
+	/**
+	 * Jacob's Bi-Conjugate Gradient Method
+	 */
+	CLCG_BICG,
+	/**
+	 * Bi-Conjugate Gradient Method accelerated for complex symmetric A
+	 */
+	CLCG_BICG_SYM,
+	/**
+	 * Conjugate Gradient Squared Method with real coefficients.
+	 */
+	CLCG_CGS,
+	/**
+	 * Biconjugate gradient stabilized method.
+	 */
+	CLCG_BICGSTAB,
+	/**
+	 * Quasi-Minimal Residual Method (currently disabled, the enumerator is commented out)
+	 */
+	//CLCG_QMR,
+	/**
+	 * Transpose Free Quasi-Minimal Residual Method
+	 */
+	CLCG_TFQMR,
+	/**
+	 * Preconditioned conjugate gradient
+	 */
+	CLCG_PCG,
+	/**
+	 * Preconditioned Bi-Conjugate Gradient Method
+	 */
+	CLCG_PBICG,
+};
+
+/**
+ * @brief      Return values of the clcg_solver() function.
+ * 
+ * Non-negative values report a status, negative values report an error. 
+ * The error codes count upwards from CLCG_UNKNOWN_ERROR (-1024). 
+ * clcg_error_str() turns a value into a readable message.
+ */
+enum clcg_return_enum
+{
+	CLCG_SUCCESS = 0, ///< The solver function terminated successfully.
+	CLCG_CONVERGENCE = 0, ///< The iteration reached convergence (same value as CLCG_SUCCESS).
+	CLCG_STOP, ///< The iteration is stopped by the monitoring function.
+	CLCG_ALREADY_OPTIMIZIED, ///< The initial solution is already optimized.
+	// A negative number means an error
+	CLCG_UNKNOWN_ERROR = -1024, ///< Unknown error.
+	CLCG_INVILAD_VARIABLE_SIZE, ///< The variable size is negative
+	CLCG_INVILAD_MAX_ITERATIONS, ///< The maximal iteration times is negative.
+	CLCG_INVILAD_EPSILON, ///< Invalid epsilon (reported by clcg_error_str() as not in the range (0, 1)).
+	CLCG_REACHED_MAX_ITERATIONS, ///< Iteration reached maximal limit.
+	CLCG_NAN_VALUE, ///< Nan value.
+	CLCG_INVALID_POINTER, ///< Invalid pointer.
+	CLCG_SIZE_NOT_MATCH, ///< Sizes of m and B do not match
+	CLCG_UNKNOWN_SOLVER, ///< Unknown solver
+};
+
+/**
+ * @brief      Parameters of the complex conjugate gradient methods.
+ * 
+ * The default values quoted below are the ones stored in defparam2.
+ */
+struct clcg_para
+{
+	/**
+	 * Maximal iteration times. The process will continue till the convergence is met
+	 * if this option is set to zero (default).
+	*/
+	int max_iterations;
+
+	/**
+	 * Epsilon for convergence test.
+	 * This parameter determines the accuracy with which the solution is to be found. 
+	 * A minimization terminates when ||g||/max(||g0||, 1.0) <= epsilon or sqrt(||g||)/N 
+	 * <= epsilon for the lcg_solver() function, where ||.|| denotes the Euclidean (L2) norm. 
+	 * The default value of epsilon is 1e-8. For box-constrained methods, the convergence test 
+	 * is implemented using ||P(m-g) - m|| <= epsilon, in which P is the projector that 
+	 * transfers m into the constrained domain.
+	 * NOTE(review): this text mirrors the lcg_para documentation (it names lcg_solver() and 
+	 * box-constrained methods) -- confirm that it also holds for the complex solvers.
+	*/
+	lcg_float epsilon;
+
+	/**
+	 * Whether to use absolute mean differences (AMD) between |Ax - B| to evaluate the process. 
+	 * The default value is 0 (false) which means the gradient based evaluating method is used. 
+	 * The AMD based method will be used if this variable is set to a true (non-zero) value. 
+	 * This parameter is only applied to the non-constrained methods.
+	 */
+	int abs_diff;
+};
+
+/**
+ * Default parameters for the complex conjugate gradient methods, listed in the field order 
+ * of clcg_para: max_iterations, epsilon, abs_diff.
+ * Declared static in this header, so every translation unit that includes it gets its own copy.
+ */
+static const clcg_para defparam2 = {0, 1e-8, 0};
+
+/**
+ * @brief      Return a clcg_para type instance with default values.
+ * 
+ * Users can use this function to get default parameters' value for the complex conjugate gradient methods.
+ * 
+ * @return     A clcg_para type instance.
+ */
+clcg_para clcg_default_parameters();
+
+/**
+ * @brief      Select a type of solver according to the name
+ *
+ * @param[in]  slr_char  Name of the solver
+ *
+ * @return     The clcg solver enum.
+ */
+clcg_solver_enum clcg_select_solver(std::string slr_char);
+
+/**
+ * @brief      Display or throw out a string explanation for the clcg_solver() function's return values.
+ * 
+ * The function returns nothing. The explanation is either written to the 
+ * standard error stream or carried by the thrown exception.
+ *
+ * @param[in]  er_index  The error index returned by the clcg_solver() function.
+ * @param[in]  er_throw  Throw out the explanation instead of displaying it. The default is false.
+ */
+void clcg_error_str(int er_index, bool er_throw = false);
+
+#endif // _LCG_UTIL_H
--- a/src/sample/sample1.cpp
+++ b/src/sample/sample1.cpp
@@ -0,0 +1,167 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "cmath"
+#include "iostream"
+#include "../lib/lcg.h"
+
+#define M 100
+#define N 80
+
+// Largest absolute element-wise difference between a and b over the first
+// `size` entries. The result stays at -1 when the loop body never runs.
+lcg_float max_diff(const lcg_float *a, const lcg_float *b, int size)
+{
+	lcg_float largest = -1;
+	int idx = 0;
+	while (idx < size)
+	{
+		const lcg_float delta = a[idx] - b[idx];
+		largest = lcg_max(sqrt(delta*delta), largest);
+		++idx;
+	}
+	return largest;
+}
+
+// Kernel matrix stored as a plain two-dimensional array (M rows by N columns, allocated in main)
+lcg_float **kernel;
+// Array for intermediate results (M elements, allocated in main)
+lcg_float *tmp_arr;
+// Preconditioner, kept as one factor per unknown (N elements, filled in main)
+lcg_float *p;
+
+// Compute the product of the kernel matrix and a vector for the solver.
+// Two products are chained: kernel with x (MatNormal) into tmp_arr, then kernel
+// with tmp_arr (MatTranspose) into prod_Ax.
+// NOTE(review): presumably this evaluates (kernel^T * kernel) * x -- confirm against lcg_matvec().
+// The instance argument is not used.
+void CalAx(void* instance, const lcg_float* x, lcg_float* prod_Ax, const int n_s)
+{
+	lcg_matvec(kernel, x, tmp_arr, M, n_s, MatNormal);
+	lcg_matvec(kernel, tmp_arr, prod_Ax, M, n_s, MatTranspose);
+	return;
+}
+
+// Apply the preconditioner element by element: prod_Mx[i] = p[i]*x[i] for i in [0, n_s).
+// The instance argument is not used.
+void CalMx(void* instance, const lcg_float* x, lcg_float* prod_Mx, const int n_s)
+{
+	// Use a signed index: comparing a size_t counter with the signed n_s would
+	// turn a negative n_s into a huge unsigned bound.
+	for (int i = 0; i < n_s; i++)
+	{
+		prod_Mx[i] = p[i]*x[i];
+	}
+	return;
+}
+
+// Monitoring function of the conjugate gradient process: rewrites the current
+// line of std::clog with the iteration count and the convergence value.
+// Always returns 0.
+int Prog(void* instance, const lcg_float* m, const lcg_float converge, const lcg_para* param, const int n_s, const int k)
+{
+	std::clog << "\rIteration-times: " << k;
+	std::clog << "\tconvergence: " << converge;
+	return 0;
+}
+
+// Build a random M-by-N kernel and a known model fm, form the right-hand side B
+// from them, then run each solver from a zero start and print the maximal
+// difference between the recovered model and fm.
+int main(int argc, char const *argv[])
+{
+	kernel = lcg_malloc(M, N);
+	tmp_arr = lcg_malloc(M);
+	p = lcg_malloc(N);
+
+	lcg_vecrnd(kernel, -1.0, 1.0, M, N);
+
+	// Generate a forward (reference) solution
+	lcg_float *fm = lcg_malloc(N);
+	lcg_vecrnd(fm, 1.0, 2.0, N);
+
+	// Compute the B term of the conjugate gradient system (same two products as in CalAx)
+	lcg_float *B = lcg_malloc(N);
+	lcg_matvec(kernel, fm, tmp_arr, M, N, MatNormal);
+	lcg_matvec(kernel, tmp_arr, B, M, N, MatTranspose);
+
+	/******************** Preparation finished ************************/
+	lcg_para self_para = lcg_default_parameters();
+	self_para.epsilon = 1e-7;
+	self_para.abs_diff = 0;
+
+	// Declare a solution vector, starting from zero
+	lcg_float *m = lcg_malloc(N);
+	lcg_vecset(m, 0.0, N);
+
+	// Build the preconditioning factors: one over the sum of squares of each kernel column
+	lcg_float diag;
+	for (size_t i = 0; i < N; i++)
+	{
+		diag = 0.0;
+		for (size_t j = 0; j < M; j++)
+		{
+			diag += kernel[j][i]*kernel[j][i];
+		}
+		p[i] = 1.0/diag;
+	}
+
+	// Bounds of the solution for the constrained solvers (fm was drawn with the limits 1.0 and 2.0)
+	lcg_float *low = lcg_malloc(N);
+	lcg_float *hig = lcg_malloc(N);
+	lcg_vecset(low, 1.0, N);
+	lcg_vecset(hig, 2.0, N);
+
+	int ret;
+
+	std::clog << "solver: cg" << std::endl;
+	ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_CG);
+	std::clog << std::endl; lcg_error_str(ret);
+	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+
+	// The solution is reset to zero before every following run
+	lcg_vecset(m, 0.0, N);
+	std::clog << "solver: pcg" << std::endl;
+	ret = lcg_solver_preconditioned(CalAx, CalMx, Prog, m, B, N, &self_para, NULL, LCG_PCG);
+	std::clog << std::endl; lcg_error_str(ret);
+	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+
+	lcg_vecset(m, 0.0, N);
+	std::clog << "solver: cgs" << std::endl;
+	ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_CGS);
+	std::clog << std::endl; lcg_error_str(ret);
+	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+
+	lcg_vecset(m, 0.0, N);
+	std::clog << "solver: bicgstab" << std::endl;
+	ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_BICGSTAB);
+	std::clog << std::endl; lcg_error_str(ret);
+	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+
+	lcg_vecset(m, 0.0, N);
+	std::clog << "solver: bicgstab2" << std::endl;
+	ret = lcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, LCG_BICGSTAB2);
+	std::clog << std::endl; lcg_error_str(ret);
+	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+
+	lcg_vecset(m, 0.0, N);
+	std::clog << "solver: pg" << std::endl;
+	ret = lcg_solver_constrained(CalAx, Prog, m, B, low, hig, N, &self_para, NULL, LCG_PG);
+	std::clog << std::endl; lcg_error_str(ret);
+	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+
+	lcg_vecset(m, 0.0, N);
+	std::clog << "solver: spg" << std::endl;
+	ret = lcg_solver_constrained(CalAx, Prog, m, B, low, hig, N, &self_para, NULL, LCG_SPG);
+	std::clog << std::endl; lcg_error_str(ret);
+	std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+
+	// Release everything allocated above
+	lcg_free(kernel, M);
+	lcg_free(tmp_arr);
+	lcg_free(fm);
+	lcg_free(B);
+	lcg_free(m);
+	lcg_free(p);
+	lcg_free(low);
+	lcg_free(hig);
+	return 0;
+}
--- a/src/sample/sample10.cu
+++ b/src/sample/sample10.cu
@@ -0,0 +1,318 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+
+#include "../lib/solver_cuda.h"
+
+// Complex constants (1, 0) and (0, 0), declared as global variables. Their
+// addresses are passed as the scalar arguments of the cuSPARSE calls below.
+cuDoubleComplex one = {1.0, 0.0};
+cuDoubleComplex zero = {0.0, 0.0};
+
+// Read a sparse system from a binary file.
+//
+// File layout, in reading order: int N, int nz, then nz records of
+// (int row index, int column index, cuDoubleComplex value), then N
+// cuDoubleComplex entries of the vector b.
+//
+// The four arrays are allocated here with new[] and handed to the caller, who
+// has to release them with delete[].
+//
+// Throws std::ios_base::failure when the file cannot be opened or holds fewer
+// bytes than the layout requires. Arrays allocated before such a failure have
+// already been stored in the output pointers.
+void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
+	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	// Make a failed open (checked right here) and every short read throw,
+	// instead of silently leaving N, nz and the arrays with garbage.
+	in.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+
+	*cooVal = new cuDoubleComplex[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new cuDoubleComplex[*pN]{};
+
+	for (int i = 0; i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
+	}
+
+	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
+	return;
+}
+
+// Read a reference solution from a binary file.
+//
+// File layout, in reading order: int N, then N cuDoubleComplex entries.
+// The array is allocated here with new[] and handed to the caller, who has to
+// release it with delete[].
+//
+// Throws std::ios_base::failure when the file cannot be opened or holds fewer
+// bytes than the layout requires.
+void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	// Make a failed open (checked right here) and every short read throw,
+	// instead of silently leaving N and the array with garbage.
+	in.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+
+	in.read((char*)pN, sizeof(int));
+
+	*x = new cuDoubleComplex[*pN]{};
+
+	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
+	return;
+}
+
+// Error measure between two complex arrays: the square root of the summed
+// squared moduli of clcg_Zdiff(a[i], b[i]), divided by n.
+lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
+{
+	lcg_float avg = 0.0;
+	cuDoubleComplex tmp;
+	// Use a signed index: comparing a size_t counter with the signed n would
+	// turn a negative n into a huge unsigned bound.
+	for (int i = 0; i < n; i++)
+	{
+		tmp = clcg_Zdiff(a[i], b[i]);
+		avg += (tmp.x*tmp.x + tmp.y*tmp.y);
+	}
+	return sqrt(avg)/n;
+}
+
+// Sample built on CLCG_CUDA_Solver: solve() loads a complex sparse system,
+// prepares two preconditioners on the device and runs the preconditioned
+// solver once with each of them. AxProduct() and MxProduct() supply the
+// matrix and the preconditioner products.
+class sample10 : public CLCG_CUDA_Solver
+{
+public:
+	// MxProduct() branches on use_incomplete_cholesky, so give the flag a
+	// defined value even before solve() sets it.
+	sample10() : use_incomplete_cholesky(false) {}
+	virtual ~sample10(){}
+
+	void solve(std::string inputPath, std::string answerPath);
+
+	// prod_Ax = op(A) * x, evaluated by cusparseSpMV with the scalars one and
+	// zero. smat_A and d_buf must have been prepared by solve().
+	void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
+	cusparseOperation_t oper_t)
+	{
+		// Calculate the product of A*x
+		cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax, CUDA_C_64F, CUSPARSE_MV_ALG_DEFAULT, d_buf);
+		return;
+	}
+
+	// Apply the preconditioner to x and store the result in prod_Ax.
+	// oper_t is not used here.
+	void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+		cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
+		cusparseOperation_t oper_t)
+	{
+		// Raw device pointers behind the two vector descriptors
+		void *d_x, *d_Ax;
+		cusparseDnVecGetValues(x, &d_x);
+		cusparseDnVecGetValues(prod_Ax, &d_Ax);
+
+		if (use_incomplete_cholesky)
+		{
+			// Two triangular solves with the factor stored in d_ic: the first
+			// (non-transposed) writes into d_pd, the second (transposed) reads
+			// d_pd and writes the result. Note that d_pd is overwritten here,
+			// while the other branch reads it as its preconditioner data.
+			cusparseZcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, (cuDoubleComplex*) d_x, (cuDoubleComplex*) d_pd, 
+				CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+
+			cusparseZcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, (cuDoubleComplex*) d_pd, (cuDoubleComplex*) d_Ax, 
+				CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+		}
+		else
+		{
+			// Element-wise operation of x with d_pd, which solve() fills with the
+			// diagonal of A (presumably a division -- confirm against
+			// clcg_vecDvecZ_element_wise()).
+			clcg_vecDvecZ_element_wise((cuDoubleComplex*) d_x, d_pd, (cuDoubleComplex*) d_Ax, n_size);
+		}	
+		return;
+	}
+
+private:
+	// Selects the preconditioner used by MxProduct()
+	bool use_incomplete_cholesky;
+
+	// Host copy of the system: sizes, COO indices, values, right-hand side and reference answer
+	int N, nz;
+	int *rowIdxA, *colIdxA;
+	cuDoubleComplex *A, *b;
+	cuDoubleComplex *ans_x;
+
+	// Device work buffer shared by the cuSPARSE calls, and the sparse matrix descriptor of A
+	void *d_buf;
+	cusparseSpMatDescr_t smat_A;
+
+	int *d_rowIdxA; // COO
+	int *d_rowPtrA; // CSR
+	int *d_colIdxA;
+	cuDoubleComplex *d_A;
+	cuDoubleComplex *d_pd;
+	cuDoubleComplex *d_ic;
+
+	// Descriptors and info objects of the incomplete-Cholesky factorization and the triangular solves
+	cusparseMatDescr_t descr_A;
+	cusparseMatDescr_t descr_L;
+	csric02Info_t icinfo_A;
+	csrsv2Info_t info_L;
+	csrsv2Info_t info_LT;
+
+	// Host solution vector
+	cuDoubleComplex *host_m;
+	// NOTE(review): solve() declares a local variable of the same name, so this
+	// member is not referenced by the code in this file.
+	cusparseDnVecDescr_t dvec_tmp;
+};
+
+// Load the system and the reference answer, move the matrix to the device,
+// prepare the diagonal and the incomplete-Cholesky preconditioners, run
+// MinimizePreconditioned() once with each of them and print the error against
+// the reference answer. All host and device resources are released at the end.
+// NOTE(review): none of the CUDA/cuBLAS/cuSPARSE return codes are checked.
+void sample10::solve(std::string inputPath, std::string answerPath)
+{
+	// readAnswer() stores its own size into N as well, overwriting the value set by read()
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+	readAnswer(answerPath, &N, &ans_x);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+
+	// Create handles
+	cublasHandle_t cubHandle;
+	cusparseHandle_t cusHandle;
+
+	cublasCreate(&cubHandle);
+	cusparseCreate(&cusHandle);
+
+	// Allocate GPU memory & copy matrix/vector to device
+	cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex));
+	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
+	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxA, nz * sizeof(int));
+	cudaMalloc(&d_pd, N * sizeof(cuDoubleComplex));
+
+	cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+
+	// Convert matrix A from COO format to CSR format
+	cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
+
+	// This is just used to get bufferSize;
+	// NOTE(review): this local variable hides the class member of the same name.
+	cusparseDnVecDescr_t dvec_tmp;
+	cusparseCreateDnVec(&dvec_tmp, N, d_pd, CUDA_C_64F);
+
+	size_t bufferSize_B;
+	cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize_B);
+
+	// --- Start of the preconditioning part ---
+	// Get the diagonal elements
+	clcg_smZcsr_get_diagonal(d_rowPtrA, d_colIdxA, d_A, N, d_pd);
+
+	// Copy A
+	cudaMalloc(&d_ic, nz * sizeof(cuDoubleComplex));
+	cudaMemcpy(d_ic, d_A, nz * sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice);
+
+	// create descriptor for matrix A
+	cusparseCreateMatDescr(&descr_A);
+
+	// initialize properties of matrix A
+	cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_SYMMETRIC);
+	cusparseSetMatFillMode(descr_A, CUSPARSE_FILL_MODE_LOWER);
+	cusparseSetMatDiagType(descr_A, CUSPARSE_DIAG_TYPE_NON_UNIT);
+	cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO);
+
+	// create descriptor for matrix L
+	cusparseCreateMatDescr(&descr_L);
+
+	// initialize properties of matrix L
+	cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
+	cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT);
+	cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create empty info objects for incomplete-cholesky factorization
+	cusparseCreateCsric02Info(&icinfo_A);
+	cusparseCreateCsrsv2Info(&info_L);
+	cusparseCreateCsrsv2Info(&info_LT);
+
+	// NOTE(review): the size_t value bufferSize_B is narrowed to int here.
+	int bufferSize, bufferSize_A, bufferSize_L, bufferSize_LT;
+	bufferSize = bufferSize_B;
+
+	// Compute buffer size in computing ic factorization
+	cusparseZcsric02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA, 
+		d_colIdxA, icinfo_A, &bufferSize_A);
+	cusparseZcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
+		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, &bufferSize_L);
+	cusparseZcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, 
+		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, &bufferSize_LT);
+	
+	// One work buffer, sized for the largest of the four requirements, serves all the calls
+	bufferSize = max(max(max(bufferSize, bufferSize_A), bufferSize_L), bufferSize_LT);
+	cudaMalloc(&d_buf, bufferSize);
+
+	// Perform incomplete-cholesky factorization: analysis phase
+	cusparseZcsric02_analysis(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, 
+		d_colIdxA, icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	cusparseZcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
+		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	cusparseZcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, 
+		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+
+	// Perform incomplete-cholesky factorization: solve phase (d_ic is factorized in place)
+	cusparseZcsric02(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, d_colIdxA, 
+		icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	// --- End of the preconditioning part ---
+
+	// Declare an initial solution
+	host_m = new cuDoubleComplex[N];
+
+	// NOTE(review): self_para is configured here but is not passed to anything
+	// in this function, so the epsilon of 1e-6 does not appear to reach the solver.
+    clcg_para self_para = clcg_default_parameters();
+	self_para.epsilon = 1e-6;
+
+	// Preconditioning with Diagonal elements
+	for (size_t i = 0; i < N; i++)
+	{
+		host_m[i].x = 0.0; host_m[i].y = 0.0;	
+	}
+
+	// This run has to come first: the incomplete-Cholesky branch of MxProduct() overwrites d_pd
+	use_incomplete_cholesky = false;
+	MinimizePreconditioned(cubHandle, cusHandle, host_m, b, N, nz, CLCG_PCG);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+	
+	// Preconditioning with incomplete-Cholesky factorization
+	for (size_t i = 0; i < N; i++)
+	{
+		host_m[i].x = 0.0; host_m[i].y = 0.0;	
+	}
+
+	use_incomplete_cholesky = true;
+	MinimizePreconditioned(cubHandle, cusHandle, host_m, b, N, nz, CLCG_PCG);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Free Host memory
+	delete[] A;
+	delete[] rowIdxA;
+	delete[] colIdxA;
+	delete[] b;
+	delete[] ans_x;
+	delete[] host_m;
+
+	// Free Device memory
+	cudaFree(d_A);
+	cudaFree(d_rowIdxA);
+	cudaFree(d_rowPtrA);
+	cudaFree(d_colIdxA);
+	cudaFree(d_pd);
+	cudaFree(d_ic);
+
+	cusparseDestroyDnVec(dvec_tmp);
+	cusparseDestroySpMat(smat_A);
+	cudaFree(d_buf);
+
+	cusparseDestroyMatDescr(descr_A);
+	cusparseDestroyMatDescr(descr_L);
+	cusparseDestroyCsric02Info(icinfo_A);
+	cusparseDestroyCsrsv2Info(info_L);
+	cusparseDestroyCsrsv2Info(info_LT);
+
+	// Free handles
+	cublasDestroy(cubHandle);
+	cusparseDestroy(cusHandle);
+	return;
+}
+
+// Run the sample on the bundled 10K test case with the report interval set to 0.
+int main(int argc, char **argv)
+{
+	// System file (matrix and right-hand side) and the file holding the reference answer
+	std::string inputPath("data/case_10K_cA");
+	std::string answerPath("data/case_10K_cB");
+
+	sample10 solver_app;
+	solver_app.set_report_interval(0);
+	solver_app.solve(inputPath, answerPath);
+
+	return 0;
+}
--- a/src/sample/sample11.cu
+++ b/src/sample/sample11.cu
@@ -0,0 +1,299 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+
+#include "../lib/clcg_cuda.h"
+
+/**
+ * @brief Load a complex sparse system in COO form, plus its right-hand side,
+ *        from a binary file.
+ *
+ * Layout consumed here: int N, int nz, then nz records of
+ * (int row, int col, cuDoubleComplex value), then N cuDoubleComplex entries of b.
+ * All four output arrays are allocated with new[]; the caller owns them and
+ * must release them with delete[].
+ *
+ * @param filePath   path of the binary input file
+ * @param pN         receives the vector length N
+ * @param pnz        receives the number of stored entries nz
+ * @param cooVal     receives the nz matrix values
+ * @param cooRowIdx  receives the nz row indices
+ * @param cooColIdx  receives the nz column indices
+ * @param b          receives the N right-hand-side entries
+ *
+ * @throws std::ios_base::failure if the file cannot be opened, ends early,
+ *         or declares a negative size. Arrays already handed to the outputs
+ *         when a later read fails remain owned by the caller.
+ */
+void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
+	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in.is_open())
+	{
+		throw std::ios_base::failure("read: cannot open file " + filePath);
+	}
+	// From here on a short or failed read throws instead of silently leaving
+	// uninitialized sizes or partially filled arrays behind.
+	in.exceptions(std::ios::failbit | std::ios::badbit);
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+	if (*pN < 0 || *pnz < 0)
+	{
+		throw std::ios_base::failure("read: negative size in file " + filePath);
+	}
+
+	*cooVal = new cuDoubleComplex[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new cuDoubleComplex[*pN]{};
+
+	// Entries are stored interleaved: row, column, value
+	for (int i = 0; i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
+	}
+
+	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
+	return;
+}
+
+/**
+ * @brief Load a reference solution vector from a binary file.
+ *
+ * Layout consumed here: int N followed by N cuDoubleComplex entries.
+ * The output array is allocated with new[] and owned by the caller.
+ *
+ * @param filePath  path of the binary answer file
+ * @param pN        receives the vector length stored in the file
+ *                  (overwrites whatever value it held before)
+ * @param x         receives the N solution entries
+ *
+ * @throws std::ios_base::failure if the file cannot be opened, ends early,
+ *         or declares a negative length.
+ */
+void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in.is_open())
+	{
+		throw std::ios_base::failure("readAnswer: cannot open file " + filePath);
+	}
+	// Turn short or failed reads into exceptions instead of garbage data.
+	in.exceptions(std::ios::failbit | std::ios::badbit);
+
+	in.read((char*)pN, sizeof(int));
+	if (*pN < 0)
+	{
+		throw std::ios_base::failure("readAnswer: negative size in file " + filePath);
+	}
+
+	*x = new cuDoubleComplex[*pN]{};
+
+	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
+	return;
+}
+
+/**
+ * @brief Error measure between two complex vectors: the square root of the
+ *        summed squared magnitudes of clcg_Zdiff(a[i], b[i]), divided by n.
+ *
+ * @param a  first vector (n entries)
+ * @param b  second vector (n entries)
+ * @param n  number of entries to compare
+ * @return   sqrt(sum |clcg_Zdiff(a[i], b[i])|^2) / n, or 0 when n <= 0
+ */
+lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
+{
+	// Nothing to compare. This also avoids 0/0 and the unsigned wrap-around
+	// a negative n would cause if the loop index were a size_t.
+	if (n <= 0) return 0.0;
+
+	lcg_float sum_sq = 0.0;
+	for (int i = 0; i < n; i++)
+	{
+		const cuDoubleComplex diff = clcg_Zdiff(a[i], b[i]);
+		sum_sq += (diff.x*diff.x + diff.y*diff.y);
+	}
+	return sqrt(sum_sq)/n;
+}
+
+// Global state shared between main() and the solver callbacks below.
+
+// Scalars passed to cuSPARSE as alpha / beta. They are initialised at their
+// definition so they already hold 1 and 0 when main() queries the SpMV buffer
+// size, which happens before cudaAx() assigns them for the first time.
+cuDoubleComplex one = {1.0, 0.0};
+cuDoubleComplex zero = {0.0, 0.0};
+
+void *d_buf; // device workspace shared by the cuSPARSE calls in this file
+cusparseSpMatDescr_t smat_A; // generic-API CSR descriptor of A, used by cusparseSpMV
+
+int *d_rowIdxA; // COO
+int *d_rowPtrA; // CSR
+int *d_colIdxA;
+cuDoubleComplex *d_A; // values of A on the device
+cuDoubleComplex *d_pd; // intermediate vector between the two triangular solves in cudaMx_ILU
+cuDoubleComplex *d_iu; // copy of A's values that main() hands to cusparseZcsrilu02
+
+// Legacy matrix descriptors and analysis info objects (created in main())
+cusparseMatDescr_t descr_A = 0;
+cusparseMatDescr_t descr_L = 0;
+cusparseMatDescr_t descr_U = 0;
+csrilu02Info_t info_ILU = 0;
+csrsv2Info_t info_L = 0;
+csrsv2Info_t info_U = 0;
+
+/**
+ * Matrix-vector callback: writes op(A) * x into prod_Ax with cusparseSpMV,
+ * where op is selected by oper_t and A is the global smat_A.
+ * instance, cub_handle, n_size and nz_size are not used here.
+ */
+void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
+	cusparseOperation_t oper_t)
+{
+	// alpha = 1 + 0i
+	one.x = 1.0;
+	one.y = 0.0;
+	// beta = 0 + 0i
+	zero.x = 0.0;
+	zero.y = 0.0;
+
+	// prod_Ax = alpha * op(A) * x + beta * prod_Ax, using the global workspace d_buf
+	cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax,
+		CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
+}
+
+/**
+ * Preconditioner callback: applies two csrsv2 triangular solves in sequence.
+ * The first uses descr_L/info_L and maps x into the global d_pd; the second
+ * uses descr_U/info_U and maps d_pd into prod_Ax. Both read the factor values
+ * from d_iu with the CSR pattern of A (d_rowPtrA, d_colIdxA).
+ * instance, cub_handle and oper_t are not used here.
+ */
+void cudaMx_ILU(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
+	cusparseOperation_t oper_t)
+{
+	// Raw device pointers behind the two dense-vector descriptors
+	void *in_raw, *out_raw;
+	cusparseDnVecGetValues(x, &in_raw);
+	cusparseDnVecGetValues(prod_Ax, &out_raw);
+
+	cuDoubleComplex *rhs = (cuDoubleComplex*) in_raw;
+	cuDoubleComplex *result = (cuDoubleComplex*) out_raw;
+
+	one.x = 1.0;
+	one.y = 0.0;
+
+	// Lower-triangular solve: x -> d_pd
+	cusparseZcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_size, nz_size, &one,
+		descr_L, d_iu, d_rowPtrA, d_colIdxA, info_L, rhs, d_pd,
+		CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+
+	// Upper-triangular solve: d_pd -> prod_Ax
+	cusparseZcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_size, nz_size, &one,
+		descr_U, d_iu, d_rowPtrA, d_colIdxA, info_U, d_pd, result,
+		CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+}
+
+/**
+ * Progress callback: stays silent until the convergence value is at or below
+ * param->epsilon, then logs the iteration count and the value to std::clog.
+ * Always returns 0. instance, m, n_size and nz_size are not used here.
+ */
+int cudaProgress(void* instance, const cuDoubleComplex* m, const lcg_float converge, 
+	const clcg_para* param, const int n_size, const int nz_size, const int k)
+{
+	// Written as a negated <= so a NaN convergence value stays silent,
+	// exactly as with a plain `converge <= epsilon` test.
+	if (!(converge <= param->epsilon))
+	{
+		return 0;
+	}
+
+	std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
+	return 0;
+}
+
+/**
+ * Sample 11: solve a complex sparse system with the preconditioned solver
+ * clcg_solver_preconditioned_cuda (CLCG_PCG), using an incomplete-LU (ILU0)
+ * factorization computed by cusparseZcsrilu02 as the preconditioner.
+ *
+ * Steps: read the COO system and the reference solution, upload to the GPU,
+ * convert the row indices to CSR, factorize a copy of A, run the solver from
+ * a zero initial guess, report the error against the reference, release
+ * everything. Command-line arguments are not used.
+ *
+ * @return always 0
+ */
+int main(int argc, char **argv)
+{
+	std::string inputPath = "data/case_1M_cA";
+	std::string answerPath = "data/case_1M_cB";
+
+	int N;
+	int nz;
+	cuDoubleComplex *A;
+	int *rowIdxA;
+	int *colIdxA;
+	cuDoubleComplex *b;
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+
+	// NOTE: this overwrites N with the length stored in the answer file
+	cuDoubleComplex *ans_x;
+	readAnswer(answerPath, &N, &ans_x);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+
+	// Create handles
+	cublasHandle_t cubHandle;
+	cusparseHandle_t cusHandle;
+
+	cublasCreate(&cubHandle);
+	cusparseCreate(&cusHandle);
+
+	// Allocate GPU memory & copy matrix/vector to device
+	cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex));
+	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
+	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxA, nz * sizeof(int));
+	cudaMalloc(&d_pd, N * sizeof(cuDoubleComplex));
+
+	cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+
+	// Convert matrix A from COO format to CSR format
+	cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
+
+	// This is just used to get bufferSize;
+	cusparseDnVecDescr_t dvec_tmp;
+	cusparseCreateDnVec(&dvec_tmp, N, d_pd, CUDA_C_64F);
+
+	// Make sure alpha/beta hold 1 and 0 for the query; cudaAx() has not run yet.
+	one.x = 1.0; one.y = 0.0;
+	zero.x = 0.0; zero.y = 0.0;
+
+	// Query with the same algorithm cudaAx() passes to cusparseSpMV, so the
+	// workspace size matches the call that actually uses it.
+	size_t bufferSize_B;
+	cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B);
+
+	// --- Start of the preconditioning part ---
+	// Copy A
+	cudaMalloc(&d_iu, nz * sizeof(cuDoubleComplex));
+	cudaMemcpy(d_iu, d_A, nz * sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice);
+
+	// The legacy csrilu02/csrsv2 size queries report their sizes as int
+	int bufferSize, bufferSize_A, bufferSize_L, bufferSize_U;
+	bufferSize = static_cast<int>(bufferSize_B);
+
+	// create descriptor for matrix A
+	cusparseCreateMatDescr(&descr_A);
+
+	cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO);
+
+	// create descriptor for matrix L
+	cusparseCreateMatDescr(&descr_L);
+
+	// initialize properties of matrix L
+	cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
+	cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
+	cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
+
+	// create descriptor for matrix U
+	cusparseCreateMatDescr(&descr_U);
+
+	cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
+	cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
+	cusparseSetMatIndexBase(descr_U, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create empty info objects for the incomplete-LU factorization
+	// and the two triangular solves
+	cusparseCreateCsrilu02Info(&info_ILU);
+	cusparseCreateCsrsv2Info(&info_L);
+	cusparseCreateCsrsv2Info(&info_U);
+
+	// Compute the buffer sizes needed by the ILU factorization and the solves
+	cusparseZcsrilu02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA, 
+		d_colIdxA, info_ILU, &bufferSize_A);
+	cusparseZcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
+		N, nz, descr_L, d_iu, d_rowPtrA, d_colIdxA, info_L, &bufferSize_L);
+	cusparseZcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
+		N, nz, descr_U, d_iu, d_rowPtrA, d_colIdxA, info_U, &bufferSize_U);
+	
+	// One workspace, large enough for every user
+	bufferSize = max(max(max(bufferSize, bufferSize_A), bufferSize_L), bufferSize_U);
+	cudaMalloc(&d_buf, bufferSize);
+
+	// Perform incomplete-LU factorization: analysis phase
+	cusparseZcsrilu02_analysis(cusHandle, N, nz, descr_A, d_iu, d_rowPtrA, 
+		d_colIdxA, info_ILU, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	cusparseZcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
+		N, nz, descr_L, d_iu, d_rowPtrA, d_colIdxA, info_L, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	cusparseZcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
+		N, nz, descr_U, d_iu, d_rowPtrA, d_colIdxA, info_U, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+
+	// Perform incomplete-LU factorization: numeric phase (operates on d_iu)
+	cusparseZcsrilu02(cusHandle, N, nz, descr_A, d_iu, d_rowPtrA, d_colIdxA, 
+		info_ILU, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	// --- End of the preconditioning part ---
+
+	// Solver parameters
+	clcg_para self_para = clcg_default_parameters();
+	self_para.epsilon = 1e-6;
+	self_para.abs_diff = 0;
+
+	int ret;
+	cuDoubleComplex *host_m = new cuDoubleComplex[N];
+
+	// Zero initial solution for the ILU-preconditioned run
+	for (int i = 0; i < N; i++)
+	{
+		host_m[i].x = 0.0; host_m[i].y = 0.0;	
+	}
+
+	ret = clcg_solver_preconditioned_cuda(cudaAx, cudaMx_ILU, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, CLCG_PCG);
+	lcg_error_str(ret);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Free Host memory
+	delete[] A;
+	delete[] rowIdxA;
+	delete[] colIdxA;
+	delete[] b;
+	delete[] ans_x;
+	delete[] host_m;
+
+	// Free Device memory
+	cudaFree(d_A);
+	cudaFree(d_rowIdxA);
+	cudaFree(d_rowPtrA);
+	cudaFree(d_colIdxA);
+	cudaFree(d_pd);
+	cudaFree(d_iu);
+
+	cusparseDestroyDnVec(dvec_tmp);
+	cusparseDestroySpMat(smat_A);
+	cudaFree(d_buf);
+
+	cusparseDestroyMatDescr(descr_A);
+	cusparseDestroyMatDescr(descr_L);
+	cusparseDestroyMatDescr(descr_U);
+	cusparseDestroyCsrilu02Info(info_ILU);
+	cusparseDestroyCsrsv2Info(info_L);
+	cusparseDestroyCsrsv2Info(info_U);
+
+	// Free handles
+	cublasDestroy(cubHandle);
+	cusparseDestroy(cusHandle);
+
+	return 0;
+}
--- a/src/sample/sample12.cu
+++ b/src/sample/sample12.cu
@@ -0,0 +1,306 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+
+#include "../lib/solver_cuda.h"
+#include "../lib/preconditioner_cuda.h"
+
+// File-scope constants passed by address to cuSPARSE as the alpha / beta scalars
+cuDoubleComplex one = {1.0, 0.0}; // 1 + 0i
+cuDoubleComplex zero = {0.0, 0.0}; // 0 + 0i
+
+/**
+ * @brief Load a complex sparse system in COO form, plus its right-hand side,
+ *        from a binary file.
+ *
+ * Layout consumed here: int N, int nz, then nz records of
+ * (int row, int col, cuDoubleComplex value), then N cuDoubleComplex entries of b.
+ * All four output arrays are allocated with new[]; the caller owns them and
+ * must release them with delete[].
+ *
+ * @param filePath   path of the binary input file
+ * @param pN         receives the vector length N
+ * @param pnz        receives the number of stored entries nz
+ * @param cooVal     receives the nz matrix values
+ * @param cooRowIdx  receives the nz row indices
+ * @param cooColIdx  receives the nz column indices
+ * @param b          receives the N right-hand-side entries
+ *
+ * @throws std::ios_base::failure if the file cannot be opened, ends early,
+ *         or declares a negative size. Arrays already handed to the outputs
+ *         when a later read fails remain owned by the caller.
+ */
+void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
+	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in.is_open())
+	{
+		throw std::ios_base::failure("read: cannot open file " + filePath);
+	}
+	// From here on a short or failed read throws instead of silently leaving
+	// uninitialized sizes or partially filled arrays behind.
+	in.exceptions(std::ios::failbit | std::ios::badbit);
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+	if (*pN < 0 || *pnz < 0)
+	{
+		throw std::ios_base::failure("read: negative size in file " + filePath);
+	}
+
+	*cooVal = new cuDoubleComplex[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new cuDoubleComplex[*pN]{};
+
+	// Entries are stored interleaved: row, column, value
+	for (int i = 0; i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
+	}
+
+	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
+	return;
+}
+
+/**
+ * @brief Load a reference solution vector from a binary file.
+ *
+ * Layout consumed here: int N followed by N cuDoubleComplex entries.
+ * The output array is allocated with new[] and owned by the caller.
+ *
+ * @param filePath  path of the binary answer file
+ * @param pN        receives the vector length stored in the file
+ *                  (overwrites whatever value it held before)
+ * @param x         receives the N solution entries
+ *
+ * @throws std::ios_base::failure if the file cannot be opened, ends early,
+ *         or declares a negative length.
+ */
+void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in.is_open())
+	{
+		throw std::ios_base::failure("readAnswer: cannot open file " + filePath);
+	}
+	// Turn short or failed reads into exceptions instead of garbage data.
+	in.exceptions(std::ios::failbit | std::ios::badbit);
+
+	in.read((char*)pN, sizeof(int));
+	if (*pN < 0)
+	{
+		throw std::ios_base::failure("readAnswer: negative size in file " + filePath);
+	}
+
+	*x = new cuDoubleComplex[*pN]{};
+
+	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
+	return;
+}
+
+/**
+ * @brief Error measure between two complex vectors: the square root of the
+ *        summed squared magnitudes of clcg_Zdiff(a[i], b[i]), divided by n.
+ *
+ * @param a  first vector (n entries)
+ * @param b  second vector (n entries)
+ * @param n  number of entries to compare
+ * @return   sqrt(sum |clcg_Zdiff(a[i], b[i])|^2) / n, or 0 when n <= 0
+ */
+lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
+{
+	// Nothing to compare. This also avoids 0/0 and the unsigned wrap-around
+	// a negative n would cause if the loop index were a size_t.
+	if (n <= 0) return 0.0;
+
+	lcg_float sum_sq = 0.0;
+	for (int i = 0; i < n; i++)
+	{
+		const cuDoubleComplex diff = clcg_Zdiff(a[i], b[i]);
+		sum_sq += (diff.x*diff.x + diff.y*diff.y);
+	}
+	return sqrt(sum_sq)/n;
+}
+
+/**
+ * Sample solver built on CLCG_CUDA_Solver. It supplies the matrix-vector
+ * product and the preconditioner application through cuSPARSE, and owns the
+ * host and device data that solve() sets up.
+ */
+class sample12 : public CLCG_CUDA_Solver
+{
+public:
+	sample12(){}
+	virtual ~sample12(){}
+
+	// Reads the system, prepares the GPU data and the preconditioner, runs the
+	// solver and releases everything again (defined below the class).
+	void solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+	// Writes op(A) * x into prod_Ax, with op selected by oper_t.
+	// cub_handle, n_size and nz_size are not used.
+	void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax,
+			CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
+	}
+
+	// Applies the preconditioner as two sparse triangular solves with smat_IC:
+	// a non-transposed solve from x into dvec_p, then a transposed solve from
+	// dvec_p into prod_Ax. cub_handle, n_size, nz_size and oper_t are not used.
+	void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one,
+			smat_IC, x, dvec_p, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L);
+
+		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one,
+			smat_IC, dvec_p, prod_Ax, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT);
+	}
+
+private:
+	// --- Host copy of the linear system (filled by read()/readAnswer()) ---
+	int N, nz;
+	int *rowIdxA, *colIdxA;
+	cuDoubleComplex *A, *b;
+	cuDoubleComplex *ans_x;
+
+	// --- Host copy of the incomplete-Cholesky result (COO triplets) ---
+	int *IC_row, *IC_col;
+	cuDoubleComplex *IC_val;
+
+	// --- cuSPARSE workspaces and descriptors ---
+	// d_buf is the buffer handed to cusparseSpMV; d_buf2 is further
+	// workspace for the triangular solves (both allocated in solve())
+	void *d_buf, *d_buf2;
+	cusparseSpMatDescr_t smat_A;
+	cusparseSpMatDescr_t smat_IC;
+	cusparseSpSVDescr_t descr_L, descr_LT;
+
+	// --- Device copy of A ---
+	int *d_rowIdxA; // COO
+	int *d_rowPtrA; // CSR
+	int *d_colIdxA;
+	cuDoubleComplex *d_A;
+	// Intermediate vector between the two triangular solves
+	cuDoubleComplex *d_p;
+	cusparseDnVecDescr_t dvec_p;
+
+	// --- Device copy of the factor ---
+	int *d_rowIdxIC; // COO
+	int *d_rowPtrIC; // CSR
+	int *d_colIdxIC;
+	cuDoubleComplex *d_IC;
+
+	// Solution vector handed to the solver, plus a scratch device vector
+	// that backs dvec_tmp
+	cuDoubleComplex *host_m;
+	cuDoubleComplex *d_t;
+	cusparseDnVecDescr_t dvec_tmp;
+};
+
+/**
+ * @brief Run the complete sample: load the system, build the
+ *        incomplete-Cholesky preconditioner, solve, report, clean up.
+ *
+ * The factor is computed on the host by clcg_incomplete_Cholesky_cuda_full and
+ * uploaded as a second CSR matrix (smat_IC). MxProduct() then applies it with
+ * two cusparseSpSV triangular solves.
+ *
+ * Workspace layout: the buffer passed to cusparseSpSV_analysis is kept by the
+ * SpSV descriptor and used again by cusparseSpSV_solve, so it must not be
+ * overwritten in between. d_buf is therefore reserved for the SpMV scratch
+ * space used by AxProduct(), and d_buf2 is one allocation split into two
+ * non-overlapping regions, one per triangular solve.
+ *
+ * @param inputPath   binary file with the COO matrix and right-hand side
+ * @param answerPath  binary file with the reference solution
+ * @param cub_handle  cuBLAS handle, passed on to the solver
+ * @param cus_handle  cuSPARSE handle used for all sparse operations
+ */
+void sample12::solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+	// NOTE: this overwrites N with the length stored in the answer file
+	readAnswer(answerPath, &N, &ans_x);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+
+	IC_row = new int [nz];
+	IC_col = new int [nz];
+	IC_val = new cuDoubleComplex [nz];
+
+	clcg_incomplete_Cholesky_cuda_full(rowIdxA, colIdxA, A, N, nz, IC_row, IC_col, IC_val);
+/*
+	for (size_t i = 0; i < nz; i++)
+	{
+		if (IC_row[i] >= IC_col[i])
+		{
+			std::cout << IC_row[i] << " " << IC_col[i] << " (" << IC_val[i].x << "," << IC_val[i].y << ")\n";	
+		}
+	}
+*/
+	// Allocate GPU memory & copy matrix/vector to device
+	cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex));
+	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
+	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxA, nz * sizeof(int));
+	cudaMalloc(&d_p, N * sizeof(cuDoubleComplex));
+	cusparseCreateDnVec(&dvec_p, N, d_p, CUDA_C_64F);
+
+	cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&d_IC, nz * sizeof(cuDoubleComplex));
+	cudaMalloc(&d_rowIdxIC, nz * sizeof(int));
+	cudaMalloc(&d_rowPtrIC, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxIC, nz * sizeof(int));
+
+	cudaMemcpy(d_IC, IC_val, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxIC, IC_row, nz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxIC, IC_col, nz * sizeof(int), cudaMemcpyHostToDevice);
+
+	// Convert matrix A from COO format to CSR format
+	cusparseXcoo2csr(cus_handle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
+
+	// Convert the factor from COO format to CSR format
+	cusparseXcoo2csr(cus_handle, d_rowIdxIC, nz, N, d_rowPtrIC, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_IC, N, N, nz, d_rowPtrIC, d_colIdxIC, d_IC, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
+
+	// Specify Non-Unit diagonal type.
+	//cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
+	//cusparseSpMatSetAttribute(smat_IC, CUSPARSE_SPMAT_DIAG_TYPE, &diagtype, sizeof(diagtype));
+
+	// This is just used to get bufferSize;
+	cudaMalloc(&d_t, N * sizeof(cuDoubleComplex));
+	cusparseCreateDnVec(&dvec_tmp, N, d_t, CUDA_C_64F);
+
+	size_t bufferSize_B = 0;
+	cusparseSpMV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B);
+
+	// --- Start of the preconditioning part ---
+	cusparseSpSV_createDescr(&descr_L);
+	cusparseSpSV_createDescr(&descr_LT);
+
+	size_t bufferSize_L = 0, bufferSize_LT = 0;
+
+	cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_IC, dvec_p, 
+		dvec_tmp, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, &bufferSize_L);
+	cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_IC, dvec_p, 
+		dvec_tmp, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, &bufferSize_LT);
+
+	// Three independent workspaces are needed: SpMV scratch, the L solve and
+	// the L^T solve. d_buf serves SpMV only. d_buf2 is split in two, with the
+	// second region starting on a 256-byte boundary (the alignment cudaMalloc
+	// itself guarantees). At least `align` bytes are always requested so that
+	// no pointer ends up null when a reported size is zero.
+	const size_t align = 256;
+	const size_t offset_LT = (bufferSize_L + align - 1) / align * align;
+	d_buf = nullptr;
+	d_buf2 = nullptr;
+	cudaMalloc(&d_buf, bufferSize_B > 0 ? bufferSize_B : align);
+	cudaMalloc(&d_buf2, offset_LT + bufferSize_LT + align);
+	void *buf_L = d_buf2;
+	void *buf_LT = static_cast<char*>(d_buf2) + offset_LT;
+
+	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_IC, dvec_tmp, dvec_p, 
+		CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, buf_L);
+
+	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_IC, dvec_p, dvec_tmp, 
+		CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, buf_LT);
+
+	// --- End of the preconditioning part ---
+
+	// Solver parameters
+	// NOTE(review): self_para is filled in but never passed on in this
+	// function, so MinimizePreconditioned() runs with whatever parameters the
+	// base class already holds -- confirm whether a setter call is missing.
+	clcg_para self_para = clcg_default_parameters();
+	self_para.epsilon = 1e-6;
+	self_para.abs_diff = 0;
+
+	host_m = new cuDoubleComplex[N];
+
+	// Zero initial solution for the incomplete-Cholesky preconditioned run
+	for (int i = 0; i < N; i++)
+	{
+		host_m[i].x = 0.0; host_m[i].y = 0.0;	
+	}
+
+	MinimizePreconditioned(cub_handle, cus_handle, host_m, b, N, nz, CLCG_PCG);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Free Host memory
+	if (rowIdxA != nullptr) delete[] rowIdxA;
+	if (colIdxA != nullptr) delete[] colIdxA;
+	if (A != nullptr) delete[] A;
+	if (b != nullptr) delete[] b;
+	if (ans_x != nullptr) delete[] ans_x;
+
+	if (IC_row != nullptr) delete[] IC_row;
+	if (IC_col != nullptr) delete[] IC_col;
+	if (IC_val != nullptr) delete[] IC_val;
+
+	if (host_m != nullptr) delete[] host_m;
+
+	cusparseDestroyDnVec(dvec_tmp);
+	cusparseDestroyDnVec(dvec_p);
+
+	// Free Device memory
+	cudaFree(d_buf);
+	cudaFree(d_buf2);
+	cudaFree(d_rowIdxA);
+	cudaFree(d_rowPtrA);
+	cudaFree(d_colIdxA);
+	cudaFree(d_A);
+	cudaFree(d_p);
+	cudaFree(d_t);
+
+	cudaFree(d_rowIdxIC);
+	cudaFree(d_rowPtrIC);
+	cudaFree(d_colIdxIC);
+	cudaFree(d_IC);
+
+	cusparseDestroySpMat(smat_A);
+	cusparseDestroySpMat(smat_IC);
+	cusparseSpSV_destroyDescr(descr_L);
+	cusparseSpSV_destroyDescr(descr_LT);
+	return;
+}
+
+/**
+ * Entry point of sample12: creates the cuBLAS and cuSPARSE handles, runs the
+ * sample solver on the 1M test case and destroys the handles again.
+ * Command-line arguments are not used.
+ *
+ * @return 0 on normal completion, 1 if a library handle cannot be created
+ */
+int main(int argc, char **argv)
+{
+	std::string inputPath = "data/case_1M_cA";
+	std::string answerPath = "data/case_1M_cB";
+
+	cublasHandle_t cubHandle;
+	cusparseHandle_t cusHandle;
+
+	// Stop early instead of passing an invalid handle to every later call
+	if (cublasCreate(&cubHandle) != CUBLAS_STATUS_SUCCESS)
+	{
+		std::cerr << "Failed to create the cuBLAS handle." << std::endl;
+		return 1;
+	}
+	if (cusparseCreate(&cusHandle) != CUSPARSE_STATUS_SUCCESS)
+	{
+		std::cerr << "Failed to create the cuSPARSE handle." << std::endl;
+		cublasDestroy(cubHandle);
+		return 1;
+	}
+
+	sample12 sp;
+	sp.set_report_interval(0);
+	sp.solve(inputPath, answerPath, cubHandle, cusHandle);
+
+	cublasDestroy(cubHandle);
+	cusparseDestroy(cusHandle);
+	return 0;
+}
--- a/src/sample/sample13.cu
+++ b/src/sample/sample13.cu
@@ -0,0 +1,305 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+
+#include "../lib/solver_cuda.h"
+#include "../lib/preconditioner_cuda.h"
+
+// File-scope constants passed by address to cuSPARSE as the alpha / beta scalars
+cuDoubleComplex one = {1.0, 0.0}; // 1 + 0i
+cuDoubleComplex zero = {0.0, 0.0}; // 0 + 0i
+
+/**
+ * @brief Load a complex sparse system in COO form, plus its right-hand side,
+ *        from a binary file.
+ *
+ * Layout consumed here: int N, int nz, then nz records of
+ * (int row, int col, cuDoubleComplex value), then N cuDoubleComplex entries of b.
+ * All four output arrays are allocated with new[]; the caller owns them and
+ * must release them with delete[].
+ *
+ * @param filePath   path of the binary input file
+ * @param pN         receives the vector length N
+ * @param pnz        receives the number of stored entries nz
+ * @param cooVal     receives the nz matrix values
+ * @param cooRowIdx  receives the nz row indices
+ * @param cooColIdx  receives the nz column indices
+ * @param b          receives the N right-hand-side entries
+ *
+ * @throws std::ios_base::failure if the file cannot be opened, ends early,
+ *         or declares a negative size. Arrays already handed to the outputs
+ *         when a later read fails remain owned by the caller.
+ */
+void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
+	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in.is_open())
+	{
+		throw std::ios_base::failure("read: cannot open file " + filePath);
+	}
+	// From here on a short or failed read throws instead of silently leaving
+	// uninitialized sizes or partially filled arrays behind.
+	in.exceptions(std::ios::failbit | std::ios::badbit);
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+	if (*pN < 0 || *pnz < 0)
+	{
+		throw std::ios_base::failure("read: negative size in file " + filePath);
+	}
+
+	*cooVal = new cuDoubleComplex[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new cuDoubleComplex[*pN]{};
+
+	// Entries are stored interleaved: row, column, value
+	for (int i = 0; i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
+	}
+
+	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
+	return;
+}
+
+/**
+ * @brief Load a reference solution vector from a binary file.
+ *
+ * Layout consumed here: int N followed by N cuDoubleComplex entries.
+ * The output array is allocated with new[] and owned by the caller.
+ *
+ * @param filePath  path of the binary answer file
+ * @param pN        receives the vector length stored in the file
+ *                  (overwrites whatever value it held before)
+ * @param x         receives the N solution entries
+ *
+ * @throws std::ios_base::failure if the file cannot be opened, ends early,
+ *         or declares a negative length.
+ */
+void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in.is_open())
+	{
+		throw std::ios_base::failure("readAnswer: cannot open file " + filePath);
+	}
+	// Turn short or failed reads into exceptions instead of garbage data.
+	in.exceptions(std::ios::failbit | std::ios::badbit);
+
+	in.read((char*)pN, sizeof(int));
+	if (*pN < 0)
+	{
+		throw std::ios_base::failure("readAnswer: negative size in file " + filePath);
+	}
+
+	*x = new cuDoubleComplex[*pN]{};
+
+	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
+	return;
+}
+
+/**
+ * @brief Error measure between two complex vectors: the square root of the
+ *        summed squared magnitudes of clcg_Zdiff(a[i], b[i]), divided by n.
+ *
+ * @param a  first vector (n entries)
+ * @param b  second vector (n entries)
+ * @param n  number of entries to compare
+ * @return   sqrt(sum |clcg_Zdiff(a[i], b[i])|^2) / n, or 0 when n <= 0
+ */
+lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
+{
+	// Nothing to compare. This also avoids 0/0 and the unsigned wrap-around
+	// a negative n would cause if the loop index were a size_t.
+	if (n <= 0) return 0.0;
+
+	lcg_float sum_sq = 0.0;
+	for (int i = 0; i < n; i++)
+	{
+		const cuDoubleComplex diff = clcg_Zdiff(a[i], b[i]);
+		sum_sq += (diff.x*diff.x + diff.y*diff.y);
+	}
+	return sqrt(sum_sq)/n;
+}
+
+/**
+ * Sample solver built on CLCG_CUDA_Solver. It supplies the matrix-vector
+ * product and the preconditioner application through cuSPARSE, and owns the
+ * host and device data that solve() sets up. The factor is stored as its own
+ * matrix smat_L with lnz entries.
+ */
+class sample13 : public CLCG_CUDA_Solver
+{
+public:
+	sample13(){}
+	virtual ~sample13(){}
+
+	// Reads the system, prepares the GPU data and the preconditioner, runs the
+	// solver and releases everything again (defined below the class).
+	void solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+	// Writes op(A) * x into prod_Ax, with op selected by oper_t.
+	// cub_handle, n_size and nz_size are not used.
+	void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax,
+			CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_tuf);
+	}
+
+	// Applies the preconditioner as two sparse triangular solves with smat_L:
+	// a non-transposed solve from x into dvec_p, then a transposed solve from
+	// dvec_p into prod_Ax. cub_handle, n_size, nz_size and oper_t are not used.
+	void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one,
+			smat_L, x, dvec_p, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L);
+
+		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one,
+			smat_L, dvec_p, prod_Ax, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT);
+	}
+
+private:
+	// --- Host copy of the linear system (filled by read()/readAnswer()) ---
+	// lnz is the entry count of the factor, obtained in solve()
+	int N, nz, lnz;
+	int *rowIdxA, *colIdxA;
+	cuDoubleComplex *A, *b;
+	cuDoubleComplex *ans_x;
+
+	// --- Host copy of the factor (COO triplets, lnz entries) ---
+	int *L_row, *L_col;
+	cuDoubleComplex *L_val;
+
+	// --- cuSPARSE workspaces and descriptors ---
+	// d_tuf is the buffer handed to cusparseSpMV; d_tuf2 is further
+	// workspace for the triangular solves (both allocated in solve())
+	void *d_tuf, *d_tuf2;
+	cusparseSpMatDescr_t smat_A;
+	cusparseSpMatDescr_t smat_L;
+	cusparseSpSVDescr_t descr_L, descr_LT;
+
+	// --- Device copy of A ---
+	int *d_rowIdxA; // COO
+	int *d_rowPtrA; // CSR
+	int *d_colIdxA;
+	cuDoubleComplex *d_A;
+	// Scratch device vector that backs dvec_tmp
+	cuDoubleComplex *d_t;
+	// Intermediate vector between the two triangular solves
+	cuDoubleComplex *d_p;
+	cusparseDnVecDescr_t dvec_p;
+
+	// --- Device copy of the factor ---
+	int *d_rowIdxL; // COO
+	int *d_rowPtrL; // CSR
+	int *d_colIdxL;
+	cuDoubleComplex *d_L;
+
+	// Solution vector handed to the solver
+	cuDoubleComplex *host_m;
+	cusparseDnVecDescr_t dvec_tmp;
+};
+
+/**
+ * @brief Run the complete sample: load the system, build the
+ *        incomplete-Cholesky preconditioner, solve, report, clean up.
+ *
+ * The factor is computed on the host by clcg_incomplete_Cholesky_cuda_half
+ * (lnz entries, size obtained from the matching *_buffsize call) and uploaded
+ * as a separate CSR matrix smat_L, marked lower-triangular with a non-unit
+ * diagonal. MxProduct() applies it with two cusparseSpSV triangular solves.
+ *
+ * Workspace layout: the buffer passed to cusparseSpSV_analysis is kept by the
+ * SpSV descriptor and used again by cusparseSpSV_solve, so it must not be
+ * overwritten in between. d_tuf is therefore reserved for the SpMV scratch
+ * space used by AxProduct(), and d_tuf2 is one allocation split into two
+ * non-overlapping regions, one per triangular solve.
+ *
+ * @param inputPath   binary file with the COO matrix and right-hand side
+ * @param answerPath  binary file with the reference solution
+ * @param cub_handle  cuBLAS handle, passed on to the solver
+ * @param cus_handle  cuSPARSE handle used for all sparse operations
+ */
+void sample13::solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+	// NOTE: this overwrites N with the length stored in the answer file
+	readAnswer(answerPath, &N, &ans_x);
+
+	clcg_incomplete_Cholesky_cuda_half_buffsize(rowIdxA, colIdxA, nz, &lnz);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+	std::clog << "lnz = " << lnz << std::endl;
+
+	L_row = new int [lnz];
+	L_col = new int [lnz];
+	L_val = new cuDoubleComplex [lnz];
+
+	clcg_incomplete_Cholesky_cuda_half(rowIdxA, colIdxA, A, N, nz, lnz, L_row, L_col, L_val);
+/*
+    for (size_t i = 0; i < lnz; i++)
+    {
+        std::cout << L_row[i] << " " << L_col[i] << " (" << L_val[i].x << "," << L_val[i].y << ")\n";
+    }
+*/
+	// Allocate GPU memory & copy matrix/vector to device
+	cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex));
+	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
+	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxA, nz * sizeof(int));
+	cudaMalloc(&d_t, N * sizeof(cuDoubleComplex));
+	cudaMalloc(&d_p, N * sizeof(cuDoubleComplex));
+	cusparseCreateDnVec(&dvec_p, N, d_p, CUDA_C_64F);
+
+	cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&d_L, lnz * sizeof(cuDoubleComplex));
+	cudaMalloc(&d_rowIdxL, lnz * sizeof(int));
+	cudaMalloc(&d_rowPtrL, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxL, lnz * sizeof(int));
+
+	cudaMemcpy(d_L, L_val, lnz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxL, L_row, lnz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxL, L_col, lnz * sizeof(int), cudaMemcpyHostToDevice);
+
+	// Convert matrix A from COO format to CSR format
+	cusparseXcoo2csr(cus_handle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
+
+	// Convert matrix L from COO format to CSR format
+	cusparseXcoo2csr(cus_handle, d_rowIdxL, lnz, N, d_rowPtrL, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_L, N, N, lnz, d_rowPtrL, d_colIdxL, d_L, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F);
+
+	// Specify Lower fill mode.
+	cusparseFillMode_t fillmode = CUSPARSE_FILL_MODE_LOWER;
+	cusparseSpMatSetAttribute(smat_L, CUSPARSE_SPMAT_FILL_MODE, &fillmode, sizeof(fillmode));
+
+	// Specify Non-Unit diagonal type.
+	cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
+	cusparseSpMatSetAttribute(smat_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagtype, sizeof(diagtype));
+
+	// This is just used to get bufferSize;
+	cusparseCreateDnVec(&dvec_tmp, N, d_t, CUDA_C_64F);
+
+	size_t bufferSize_B = 0;
+	cusparseSpMV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B);
+
+	// --- Start of the preconditioning part ---
+	cusparseSpSV_createDescr(&descr_L);
+	cusparseSpSV_createDescr(&descr_LT);
+
+	size_t bufferSize_L = 0, bufferSize_LT = 0;
+
+	cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, dvec_p, 
+		dvec_tmp, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, &bufferSize_L);
+	cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p, 
+		dvec_tmp, CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, &bufferSize_LT);
+
+	// Three independent workspaces are needed: SpMV scratch, the L solve and
+	// the L^T solve. d_tuf serves SpMV only. d_tuf2 is split in two, with the
+	// second region starting on a 256-byte boundary (the alignment cudaMalloc
+	// itself guarantees). At least `align` bytes are always requested so that
+	// no pointer ends up null when a reported size is zero.
+	const size_t align = 256;
+	const size_t offset_LT = (bufferSize_L + align - 1) / align * align;
+	d_tuf = nullptr;
+	d_tuf2 = nullptr;
+	cudaMalloc(&d_tuf, bufferSize_B > 0 ? bufferSize_B : align);
+	cudaMalloc(&d_tuf2, offset_LT + bufferSize_LT + align);
+	void *buf_L = d_tuf2;
+	void *buf_LT = static_cast<char*>(d_tuf2) + offset_LT;
+
+	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, dvec_tmp, dvec_p, 
+		CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, buf_L);
+
+	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p, dvec_tmp, 
+		CUDA_C_64F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, buf_LT);
+	// --- End of the preconditioning part ---
+
+	// Solver parameters
+	// NOTE(review): self_para is filled in but never passed on in this
+	// function, so MinimizePreconditioned() runs with whatever parameters the
+	// base class already holds -- confirm whether a setter call is missing.
+	clcg_para self_para = clcg_default_parameters();
+	self_para.epsilon = 1e-6;
+	self_para.abs_diff = 0;
+
+	// Zero initial solution for the incomplete-Cholesky preconditioned run
+	host_m = clcg_malloc_cuda(N);
+	clcg_vecset_cuda(host_m, zero, N);
+
+	MinimizePreconditioned(cub_handle, cus_handle, host_m, b, N, nz, CLCG_PCG);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Free Host memory
+	if (rowIdxA != nullptr) delete[] rowIdxA;
+	if (colIdxA != nullptr) delete[] colIdxA;
+	if (A != nullptr) delete[] A;
+	if (b != nullptr) delete[] b;
+	if (ans_x != nullptr) delete[] ans_x;
+
+	if (L_row != nullptr) delete[] L_row;
+	if (L_col != nullptr) delete[] L_col;
+	if (L_val != nullptr) delete[] L_val;
+
+	clcg_free_cuda(host_m);
+
+	cusparseDestroyDnVec(dvec_tmp);
+	cusparseDestroyDnVec(dvec_p);
+
+	// Free Device memory
+	cudaFree(d_tuf);
+	cudaFree(d_tuf2);
+	cudaFree(d_rowIdxA);
+	cudaFree(d_rowPtrA);
+	cudaFree(d_colIdxA);
+	cudaFree(d_A);
+	cudaFree(d_t);
+	cudaFree(d_p);
+
+	cudaFree(d_rowIdxL);
+	cudaFree(d_rowPtrL);
+	cudaFree(d_colIdxL);
+	cudaFree(d_L);
+
+	cusparseDestroySpMat(smat_A);
+	cusparseDestroySpMat(smat_L);
+	cusparseSpSV_destroyDescr(descr_L);
+	cusparseSpSV_destroyDescr(descr_LT);
+	return;
+}
+
+/**
+ * Entry point of sample13: creates the cuBLAS and cuSPARSE handles, runs the
+ * sample solver on the 10K test case and destroys the handles again.
+ * Command-line arguments are not used.
+ *
+ * @return 0 on normal completion, 1 if a library handle cannot be created
+ */
+int main(int argc, char **argv)
+{
+	std::string inputPath = "data/case_10K_cA";
+	std::string answerPath = "data/case_10K_cB";
+
+	cublasHandle_t cubHandle;
+	cusparseHandle_t cusHandle;
+
+	// Stop early instead of passing an invalid handle to every later call
+	if (cublasCreate(&cubHandle) != CUBLAS_STATUS_SUCCESS)
+	{
+		std::cerr << "Failed to create the cuBLAS handle." << std::endl;
+		return 1;
+	}
+	if (cusparseCreate(&cusHandle) != CUSPARSE_STATUS_SUCCESS)
+	{
+		std::cerr << "Failed to create the cuSPARSE handle." << std::endl;
+		cublasDestroy(cubHandle);
+		return 1;
+	}
+
+	sample13 sp;
+	sp.set_report_interval(0);
+	sp.solve(inputPath, answerPath, cubHandle, cusHandle);
+
+	cublasDestroy(cubHandle);
+	cusparseDestroy(cusHandle);
+	return 0;
+}
--- a/src/sample/sample14.cu
+++ b/src/sample/sample14.cu
@@ -0,0 +1,327 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+
+#include "../lib/solver_cuda.h"
+#include "../lib/preconditioner_cuda.h"
+
+// Single precision complex constants 1+0i and 0+0i. They live at file scope
+// because their addresses are passed as the scaling factors of the cuSPARSE
+// calls (cusparseSpMV and cusparseSpSV_*) made further down in this file.
+cuComplex one = {1.0, 0.0};
+cuComplex zero = {0.0, 0.0};
+
+/**
+ * Read a complex sparse linear system from a binary file.
+ *
+ * Layout consumed, in order: int N, int nz, then nz records of
+ * (int row, int col, cuDoubleComplex value), then N cuDoubleComplex
+ * entries of the right-hand side.
+ *
+ * All four output arrays are allocated with new[]; the caller releases them
+ * with delete[].
+ *
+ * @param filePath   path of the binary input file
+ * @param pN         receives the number of unknowns
+ * @param pnz        receives the number of stored matrix entries
+ * @param cooVal     receives the nz matrix values
+ * @param cooRowIdx  receives the nz row indices
+ * @param cooColIdx  receives the nz column indices
+ * @param b          receives the N right-hand side entries
+ *
+ * @throws std::ios_base::failure if the file can not be opened, the header is
+ *         invalid, or the file ends before all data has been read. Arrays that
+ *         were already allocated at that point stay assigned to the outputs.
+ */
+void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
+	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in) throw std::ios_base::failure("read: can not open file " + filePath);
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+
+	// The header values are used as allocation sizes, so validate them first.
+	if (!in || *pN < 0 || *pnz < 0)
+		throw std::ios_base::failure("read: invalid header in file " + filePath);
+
+	*cooVal = new cuDoubleComplex[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new cuDoubleComplex[*pN]{};
+
+	for (int i = 0; i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&(*cooVal)[i], sizeof(cuDoubleComplex));
+	}
+
+	in.read((char*)(*b), sizeof(cuDoubleComplex)*(*pN));
+
+	// A short read sets failbit on the stream; report it instead of
+	// returning zero-filled data.
+	if (!in) throw std::ios_base::failure("read: unexpected end of file " + filePath);
+	return;
+}
+
+/**
+ * Read the reference solution of the linear system from a binary file.
+ *
+ * Layout consumed: int N followed by N cuDoubleComplex entries. The output
+ * array is allocated with new[]; the caller releases it with delete[].
+ *
+ * @param filePath  path of the binary answer file
+ * @param pN        receives the number of entries
+ * @param x         receives the N solution entries
+ *
+ * @throws std::ios_base::failure if the file can not be opened, the header is
+ *         invalid, or the file ends before all data has been read.
+ */
+void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in) throw std::ios_base::failure("readAnswer: can not open file " + filePath);
+
+	in.read((char*)pN, sizeof(int));
+
+	// The header value is used as an allocation size, so validate it first.
+	if (!in || *pN < 0)
+		throw std::ios_base::failure("readAnswer: invalid header in file " + filePath);
+
+	*x = new cuDoubleComplex[*pN]{};
+
+	in.read((char*)(*x), sizeof(cuDoubleComplex)*(*pN));
+	if (!in) throw std::ios_base::failure("readAnswer: unexpected end of file " + filePath);
+	return;
+}
+
+/**
+ * Error measure between two complex vectors.
+ *
+ * Sums the squared moduli of the element-wise differences a[i] - b[i],
+ * takes the square root of that sum and divides it by n.
+ *
+ * @param a  first vector (n entries)
+ * @param b  second vector (n entries)
+ * @param n  number of entries to compare
+ * @return   sqrt(sum |a[i]-b[i]|^2) / n, or 0 when n is not positive
+ */
+float avg_error(cuComplex *a, cuComplex *b, int n)
+{
+	// Nothing to compare; this also avoids the 0/0 division below.
+	if (n <= 0) return 0.0f;
+
+	float sum_sq = 0.0f;
+	// Signed index: it is compared against the signed length n.
+	for (int i = 0; i < n; i++)
+	{
+		cuComplex diff = clcg_Cdiff(a[i], b[i]);
+		sum_sq += (diff.x*diff.x + diff.y*diff.y);
+	}
+	return sqrt(sum_sq)/n;
+}
+
+/**
+ * Sample solver derived from CLCG_CUDAF_Solver that works on single
+ * precision complex numbers.
+ *
+ * The class supplies the two products required by the base class:
+ * AxProduct() multiplies with the system matrix A and MxProduct() applies the
+ * preconditioner through two triangular solves with the factor L. All
+ * resources listed in the private section are acquired and released inside
+ * solve().
+ */
+class sample14 : public CLCG_CUDAF_Solver
+{
+public:
+	sample14() {}
+	virtual ~sample14() {}
+
+	// Loads the test case, prepares the GPU data, runs the solver and cleans up.
+	void solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle);
+
+	// prod_Ax = op(A) * x, where op is selected by oper_t. d_buf is the SpMV work buffer.
+	void AxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax,
+			CUDA_C_32F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
+	}
+
+	// Applies the preconditioner: first solves with L (result in dvec_p), then
+	// with the transpose of L (result in prod_Ax). oper_t is not used here.
+	void MxProduct(cublasHandle_t cub_handle, cusparseHandle_t cus_handle, cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, 
+		const int n_size, const int nz_size, cusparseOperation_t oper_t)
+	{
+		// Forward step with L.
+		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L,
+			x, dvec_p, CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L);
+
+		// Backward step with the transpose of L.
+		cusparseSpSV_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L,
+			dvec_p, prod_Ax, CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT);
+	}
+
+private:
+	// Number of unknowns, stored entries of A and stored entries of L.
+	int N, nz, lnz;
+
+	// Host side system in COO form: indices, double precision data as read
+	// from file, and the single precision copies handed to the solver.
+	int *rowIdxA, *colIdxA;
+	cuDoubleComplex *A, *b, *ans_x;
+	cuComplex *Af, *bf, *ans_xf;
+
+	// Host side factor L in COO form.
+	int *L_row, *L_col;
+	cuComplex *L_val;
+
+	// Device work buffers and cuSPARSE descriptors of A, L and the two solves.
+	void *d_buf, *d_buf2;
+	cusparseSpMatDescr_t smat_A, smat_L;
+	cusparseSpSVDescr_t descr_L, descr_LT;
+
+	// Device copy of A.
+	int *d_rowIdxA; // COO row indices
+	int *d_rowPtrA; // CSR row pointers
+	int *d_colIdxA;
+	cuComplex *d_A;
+
+	// Device vectors d_t and d_p; dvec_p wraps d_p.
+	cuComplex *d_t, *d_p;
+	cusparseDnVecDescr_t dvec_p;
+
+	// Device copy of L.
+	int *d_rowIdxL; // COO row indices
+	int *d_rowPtrL; // CSR row pointers
+	int *d_colIdxL;
+	cuComplex *d_L;
+
+	// Host array holding the initial guess passed to the solver; dvec_tmp wraps d_t.
+	cuComplex *host_m;
+	cusparseDnVecDescr_t dvec_tmp;
+};
+
+/**
+ * Run the whole sample: load the system, build the preconditioner and solve.
+ *
+ * Steps performed, in order:
+ *  1. read the COO matrix A, the right-hand side b and the reference solution;
+ *  2. convert them from double to single precision complex numbers;
+ *  3. compute the incomplete Cholesky factor L into host COO arrays;
+ *  4. upload A and L, convert both from COO to CSR and create their cuSPARSE descriptors;
+ *  5. size the work buffers and run the SpSV analysis for L and its transpose;
+ *  6. call MinimizePreconditioned() with CLCG_PCG and print the error against the reference;
+ *  7. release every host and device resource acquired above.
+ *
+ * @param inputPath   binary file holding A and b (consumed by read())
+ * @param answerPath  binary file holding the reference solution (consumed by readAnswer())
+ * @param cub_handle  cuBLAS handle forwarded to the solver
+ * @param cus_handle  cuSPARSE handle used for every sparse operation
+ */
+void sample14::solve(std::string inputPath, std::string answerPath, cublasHandle_t cub_handle, cusparseHandle_t cus_handle)
+{
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+	readAnswer(answerPath, &N, &ans_x);
+
+	clcg_incomplete_Cholesky_cuda_half_buffsize(rowIdxA, colIdxA, nz, &lnz);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+	std::clog << "lnz = " << lnz << std::endl;
+
+	// A holds nz entries, whereas b and ans_x hold only N entries each, so the
+	// single precision copies must be sized and filled separately. Converting
+	// all three in one loop over nz reads past the end of b and ans_x.
+	Af = new cuComplex [nz];
+	bf = new cuComplex [N];
+	ans_xf = new cuComplex [N];
+
+	// Convert the matrix values from double to single precision
+	for (int i = 0; i < nz; i++)
+	{
+		Af[i].x = A[i].x; Af[i].y = A[i].y;
+	}
+
+	// Convert the right-hand side and the reference solution
+	for (int i = 0; i < N; i++)
+	{
+		bf[i].x = b[i].x; bf[i].y = b[i].y;
+		ans_xf[i].x = ans_x[i].x; ans_xf[i].y = ans_x[i].y;
+	}
+
+	L_row = new int [lnz];
+	L_col = new int [lnz];
+	L_val = new cuComplex [lnz];
+
+	clcg_incomplete_Cholesky_cuda_half(rowIdxA, colIdxA, Af, N, nz, lnz, L_row, L_col, L_val);
+/*
+    for (size_t i = 0; i < lnz; i++)
+    {
+        std::cout << L_row[i] << " " << L_col[i] << " (" << L_val[i].x << "," << L_val[i].y << ")\n";
+    }
+*/
+	// Allocate GPU memory & copy matrix/vector to device
+	cudaMalloc(&d_A, nz * sizeof(cuComplex));
+	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
+	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxA, nz * sizeof(int));
+	cudaMalloc(&d_t, N * sizeof(cuComplex));
+	cudaMalloc(&d_p, N * sizeof(cuComplex));
+	cusparseCreateDnVec(&dvec_p, N, d_p, CUDA_C_32F);
+
+	cudaMemcpy(d_A, Af, nz * sizeof(cuComplex), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_t, bf, N * sizeof(cuComplex), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&d_L, lnz * sizeof(cuComplex));
+	cudaMalloc(&d_rowIdxL, lnz * sizeof(int));
+	cudaMalloc(&d_rowPtrL, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxL, lnz * sizeof(int));
+
+	cudaMemcpy(d_L, L_val, lnz * sizeof(cuComplex), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxL, L_row, lnz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxL, L_col, lnz * sizeof(int), cudaMemcpyHostToDevice);
+
+	// Convert matrix A from COO format to CSR format
+	cusparseXcoo2csr(cus_handle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_32F);
+
+	// Convert matrix L from COO format to CSR format
+	cusparseXcoo2csr(cus_handle, d_rowIdxL, lnz, N, d_rowPtrL, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_L, N, N, lnz, d_rowPtrL, d_colIdxL, d_L, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_32F);
+
+	// Specify Lower fill mode.
+	cusparseFillMode_t fillmode = CUSPARSE_FILL_MODE_LOWER;
+	cusparseSpMatSetAttribute(smat_L, CUSPARSE_SPMAT_FILL_MODE, &fillmode, sizeof(fillmode));
+
+	// Specify Non-Unit diagonal type.
+	cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
+	cusparseSpMatSetAttribute(smat_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagtype, sizeof(diagtype));
+
+	// This is just used to get bufferSize;
+	cusparseCreateDnVec(&dvec_tmp, N, d_t, CUDA_C_32F);
+
+	size_t bufferSize_B;
+	cusparseSpMV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		dvec_tmp, &zero, dvec_tmp, CUDA_C_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B);
+
+	// --- Start of the preconditioning part ---
+	cusparseSpSV_createDescr(&descr_L);
+	cusparseSpSV_createDescr(&descr_LT);
+
+	size_t bufferSize, bufferSize_L, bufferSize_LT;
+	bufferSize = bufferSize_B;
+
+	cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, dvec_p, 
+		dvec_tmp, CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, &bufferSize_L);
+	cusparseSpSV_bufferSize(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p, 
+		dvec_tmp, CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, &bufferSize_LT);
+
+	// Both buffers are sized for the largest of the three requirements.
+	bufferSize = max(max(bufferSize, bufferSize_L), bufferSize_LT);
+	cudaMalloc(&d_buf, bufferSize);
+	cudaMalloc(&d_buf2, bufferSize);
+
+	// NOTE(review): d_buf is handed to the SpSV analysis of L here and to
+	// cusparseSpMV in AxProduct(). The cuSPARSE documentation asks that the SpSV
+	// external buffer is not modified between analysis and solve - confirm that
+	// sharing it is safe, or give SpMV a buffer of its own.
+	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_L, dvec_tmp, dvec_p, 
+		CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_L, d_buf);
+
+	cusparseSpSV_analysis(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, &one, smat_L, dvec_p, dvec_tmp, 
+		CUDA_C_32F, CUSPARSE_SPSV_ALG_DEFAULT, descr_LT, d_buf2);
+	// --- End of the preconditioning part ---
+
+	// NOTE(review): self_para is configured below but is not passed to the solver
+	// anywhere in this function - confirm whether a parameter setter call is missing.
+	clcg_para self_para = clcg_default_parameters();
+	self_para.epsilon = 1e-6;
+	self_para.abs_diff = 0;
+
+	// Declare an initial solution (all zeros)
+	host_m = new cuComplex[N];
+	for (int i = 0; i < N; i++)
+	{
+		host_m[i].x = 0.0; host_m[i].y = 0.0;	
+	}
+
+	// Preconditioning with incomplete-Cholesky factorization
+	MinimizePreconditioned(cub_handle, cus_handle, host_m, bf, N, nz, CLCG_PCG);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_xf, N) << std::endl;
+
+	// Free Host memory (delete[] accepts null pointers, no guard is needed)
+	delete[] rowIdxA;
+	delete[] colIdxA;
+	delete[] A;
+	delete[] b;
+	delete[] ans_x;
+	delete[] Af;
+	delete[] bf;
+	delete[] ans_xf;
+
+	delete[] L_row;
+	delete[] L_col;
+	delete[] L_val;
+
+	delete[] host_m;
+
+	// Destroy the dense vector descriptors before freeing the memory they wrap
+	cusparseDestroyDnVec(dvec_tmp);
+	cusparseDestroyDnVec(dvec_p);
+
+	// Free Device memory
+	cudaFree(d_buf);
+	cudaFree(d_buf2);
+	cudaFree(d_rowIdxA);
+	cudaFree(d_rowPtrA);
+	cudaFree(d_colIdxA);
+	cudaFree(d_A);
+	cudaFree(d_t);
+	cudaFree(d_p);
+
+	cudaFree(d_rowIdxL);
+	cudaFree(d_rowPtrL);
+	cudaFree(d_colIdxL);
+	cudaFree(d_L);
+
+	cusparseDestroySpMat(smat_A);
+	cusparseDestroySpMat(smat_L);
+	cusparseSpSV_destroyDescr(descr_L);
+	cusparseSpSV_destroyDescr(descr_LT);
+	return;
+}
+
+/**
+ * Entry point of sample14.
+ *
+ * Creates the cuBLAS and cuSPARSE handles, runs sample14::solve() on the
+ * hard-coded 1K complex test case with a report interval of 100, and
+ * destroys the handles afterwards.
+ *
+ * @return 0 on success, 1 if one of the library handles can not be created.
+ */
+int main(int argc, char **argv)
+{
+	std::string inputPath = "data/case_1K_cA";
+	std::string answerPath = "data/case_1K_cB";
+
+	cublasHandle_t cubHandle;
+	cusparseHandle_t cusHandle;
+
+	// Handle creation reports failures through its return code; stop here
+	// instead of passing an invalid handle to the solver.
+	if (cublasCreate(&cubHandle) != CUBLAS_STATUS_SUCCESS)
+	{
+		std::clog << "Failed to create the cuBLAS handle." << std::endl;
+		return 1;
+	}
+
+	if (cusparseCreate(&cusHandle) != CUSPARSE_STATUS_SUCCESS)
+	{
+		std::clog << "Failed to create the cuSPARSE handle." << std::endl;
+		cublasDestroy(cubHandle);
+		return 1;
+	}
+
+	sample14 sp;
+	sp.set_report_interval(100);
+	sp.solve(inputPath, answerPath, cubHandle, cusHandle);
+
+	cublasDestroy(cubHandle);
+	cusparseDestroy(cusHandle);
+	return 0;
+}
--- a/src/sample/sample15.cu
+++ b/src/sample/sample15.cu
@@ -0,0 +1,223 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+
+#include "../lib/lcg_cuda.h"
+
+/**
+ * Read a real sparse linear system from a binary file.
+ *
+ * Layout consumed, in order: int N, int nz, then nz records of
+ * (int row, int col, double value), then N doubles of the right-hand side.
+ *
+ * All four output arrays are allocated with new[]; the caller releases them
+ * with delete[].
+ *
+ * @param filePath   path of the binary input file
+ * @param pN         receives the number of unknowns
+ * @param pnz        receives the number of stored matrix entries
+ * @param cooVal     receives the nz matrix values
+ * @param cooRowIdx  receives the nz row indices
+ * @param cooColIdx  receives the nz column indices
+ * @param b          receives the N right-hand side entries
+ *
+ * @throws std::ios_base::failure if the file can not be opened, the header is
+ *         invalid, or the file ends before all data has been read. Arrays that
+ *         were already allocated at that point stay assigned to the outputs.
+ */
+void read(std::string filePath, int *pN, int *pnz, double **cooVal,
+	int **cooRowIdx, int **cooColIdx, double **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in) throw std::ios_base::failure("read: can not open file " + filePath);
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+
+	// The header values are used as allocation sizes, so validate them first.
+	if (!in || *pN < 0 || *pnz < 0)
+		throw std::ios_base::failure("read: invalid header in file " + filePath);
+
+	*cooVal = new double[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new double[*pN]{};
+
+	for (int i = 0; i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&(*cooVal)[i], sizeof(double));
+	}
+
+	in.read((char*)(*b), sizeof(double)*(*pN));
+
+	// A short read sets failbit on the stream; report it instead of
+	// returning zero-filled data.
+	if (!in) throw std::ios_base::failure("read: unexpected end of file " + filePath);
+	return;
+}
+
+/**
+ * Read the reference solution of the linear system from a binary file.
+ *
+ * Layout consumed: int N followed by N doubles. The output array is
+ * allocated with new[]; the caller releases it with delete[].
+ *
+ * @param filePath  path of the binary answer file
+ * @param pN        receives the number of entries
+ * @param x         receives the N solution entries
+ *
+ * @throws std::ios_base::failure if the file can not be opened, the header is
+ *         invalid, or the file ends before all data has been read.
+ */
+void readAnswer(std::string filePath, int *pN, double **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in) throw std::ios_base::failure("readAnswer: can not open file " + filePath);
+
+	in.read((char*)pN, sizeof(int));
+
+	// The header value is used as an allocation size, so validate it first.
+	if (!in || *pN < 0)
+		throw std::ios_base::failure("readAnswer: invalid header in file " + filePath);
+
+	*x = new double[*pN]{};
+
+	in.read((char*)(*x), sizeof(double)*(*pN));
+	if (!in) throw std::ios_base::failure("readAnswer: unexpected end of file " + filePath);
+	return;
+}
+
+/**
+ * Error measure between two real vectors.
+ *
+ * Sums the squared element-wise differences, takes the square root of that
+ * sum and divides it by n.
+ *
+ * @param a  first vector (n entries)
+ * @param b  second vector (n entries)
+ * @param n  number of entries to compare
+ * @return   sqrt(sum (a[i]-b[i])^2) / n, or 0 when n is not positive
+ */
+lcg_float avg_error(lcg_float *a, lcg_float *b, int n)
+{
+	// Nothing to compare; this also avoids the 0/0 division below.
+	if (n <= 0) return 0.0;
+
+	lcg_float sum_sq = 0.0;
+	// Signed index: it is compared against the signed length n.
+	for (int i = 0; i < n; i++)
+	{
+		sum_sq += (a[i] - b[i])*(a[i] - b[i]);
+	}
+	return sqrt(sum_sq)/n;
+}
+
+// File-scope state shared between main() and the cudaAx() callback.
+// one and zero are passed by address as the scaling factors of cusparseSpMV.
+lcg_float one = 1.0;
+lcg_float zero = 0.0;
+
+// Work buffer handed to cusparseSpMV in cudaAx(); allocated and freed in main().
+void *d_buf;
+// cuSPARSE descriptor of the system matrix A, created in main() from the CSR arrays below.
+cusparseSpMatDescr_t smat_A;
+
+// Device copies of the matrix data.
+int *d_rowIdxA; // COO
+int *d_rowPtrA; // CSR
+int *d_colIdxA;
+double *d_A;
+
+// Matrix descriptor and csric02 info object; main() uses them to query a
+// buffer size with cusparseDcsric02_bufferSize.
+cusparseMatDescr_t descr_A = 0;
+csric02Info_t icinfo_A = 0;
+
+// Callback passed to lcg_solver_cuda() that evaluates prod_Ax = A * x.
+// It uses the file-scope descriptor smat_A and work buffer d_buf; instance,
+// cub_handle, n_size and nz_size are not used.
+void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
+{
+	// Calculate the product of A*x
+	cusparseSpMV(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		x, &zero, prod_Ax, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
+    return;
+}
+
+// Progress callback passed to lcg_solver_cuda(). It stays silent until the
+// convergence value has dropped to param->epsilon or below, then prints the
+// iteration count and the convergence value once per call. Always returns 0.
+int cudaProgress(void* instance, const lcg_float* m, const lcg_float converge, 
+	const lcg_para* param, const int n_size, const int nz_size, const int k)
+{
+	const bool reached = (converge <= param->epsilon);
+	if (reached)
+	{
+		std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
+	}
+
+	return 0;
+}
+
+/**
+ * Entry point of sample15.
+ *
+ * Loads the hard-coded 1M real test case, uploads the matrix to the GPU,
+ * converts it from COO to CSR and solves the system twice with
+ * lcg_solver_cuda(): first with LCG_CG, then with LCG_CGS. After each run the
+ * error against the reference solution is printed.
+ *
+ * @return always 0
+ */
+int main(int argc, char **argv)
+{
+	std::string inputPath = "data/case_1M_A";
+	std::string answerPath = "data/case_1M_B";
+
+	int N;
+	int nz;
+	double *A;
+	int *rowIdxA;
+	int *colIdxA;
+	double *b;
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+
+	double *ans_x;
+	readAnswer(answerPath, &N, &ans_x);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+	
+	// Create handles
+	cublasHandle_t cubHandle;
+	cusparseHandle_t cusHandle;
+
+	cublasCreate(&cubHandle);
+	cusparseCreate(&cusHandle);
+
+	// Allocate GPU memory & copy matrix/vector to device
+	cudaMalloc(&d_A, nz * sizeof(double));
+	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
+	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxA, nz * sizeof(int));
+
+	cudaMemcpy(d_A, A, nz * sizeof(double), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+
+	// Convert matrix A from COO format to CSR format
+	cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+
+	// create descriptor for matrix A
+	cusparseCreateMatDescr(&descr_A);
+
+	// initialize properties of matrix A
+	cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatFillMode(descr_A, CUSPARSE_FILL_MODE_LOWER);
+	cusparseSetMatDiagType(descr_A, CUSPARSE_DIAG_TYPE_NON_UNIT);
+	cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO);
+
+	int bufferSize = 0;
+	cusparseCreateCsric02Info(&icinfo_A);
+	cusparseDcsric02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA, 
+		d_colIdxA, icinfo_A, &bufferSize);
+
+	// d_buf is the work buffer that cudaAx() hands to cusparseSpMV, so it must
+	// be at least as large as cusparseSpMV_bufferSize requests. The csric02
+	// size queried above says nothing about that requirement. A scratch dense
+	// vector is needed only to perform the query.
+	double *d_tmp = nullptr;
+	cusparseDnVecDescr_t dvec_tmp;
+	cudaMalloc(&d_tmp, N * sizeof(double));
+	cusparseCreateDnVec(&dvec_tmp, N, d_tmp, CUDA_R_64F);
+
+	size_t spmvBufferSize = 0;
+	cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		dvec_tmp, &zero, dvec_tmp, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &spmvBufferSize);
+
+	cusparseDestroyDnVec(dvec_tmp);
+	cudaFree(d_tmp);
+
+	// Keep the previous size as a lower bound so the buffer never shrinks.
+	size_t bufferBytes = (bufferSize > 0) ? (size_t) bufferSize : 0;
+	if (spmvBufferSize > bufferBytes) bufferBytes = spmvBufferSize;
+
+	cudaMalloc(&d_buf, bufferBytes);
+
+	// Solver parameters
+	lcg_para self_para = lcg_default_parameters();
+	self_para.epsilon = 1e-6;
+	self_para.abs_diff = 0;
+
+	int ret;
+	double *host_m = new double[N];
+
+	// Solve with CG, starting from an all-zero initial solution
+	for (int i = 0; i < N; i++)
+	{
+		host_m[i] = 0.0;
+	}
+
+	ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CG);
+	lcg_error_str(ret);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Solve with CGS, starting again from an all-zero initial solution
+	for (int i = 0; i < N; i++)
+	{
+		host_m[i] = 0.0;
+	}
+
+	ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CGS);
+	lcg_error_str(ret);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Free Host memory
+	delete[] A;
+	delete[] rowIdxA;
+	delete[] colIdxA;
+	delete[] b;
+	delete[] ans_x;
+	delete[] host_m;
+
+	// Free Device memory
+	cudaFree(d_A);
+	cudaFree(d_rowIdxA);
+	cudaFree(d_rowPtrA);
+	cudaFree(d_colIdxA);
+
+	cusparseDestroySpMat(smat_A);
+	cudaFree(d_buf);
+
+	cusparseDestroyMatDescr(descr_A);
+	cusparseDestroyCsric02Info(icinfo_A);
+
+	// Free handles
+	cublasDestroy(cubHandle);
+	cusparseDestroy(cusHandle);
+
+	return 0;
+}
--- a/src/sample/sample2.cpp
+++ b/src/sample/sample2.cpp
@@ -0,0 +1,170 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "iostream"
+#include "random"
+#include "../lib/solver.h"
+
+// Dimensions of the dense kernel matrix of this sample: M rows by N columns.
+#define M 1000
+#define N 800
+
+// Largest absolute element-wise difference between a and b over the first
+// `size` entries. Returns -1 when size is not positive.
+lcg_float max_diff(const lcg_float *a, const lcg_float *b, int size)
+{
+	lcg_float largest = -1;
+	int idx = 0;
+	while (idx < size)
+	{
+		const lcg_float delta = a[idx] - b[idx];
+		largest = lcg_max(sqrt(delta*delta), largest);
+		++idx;
+	}
+	return largest;
+}
+
+// Test problem built on LCG_Solver. The system operator is K^T*K, where K is
+// the dense M x N kernel matrix, and the preconditioner is an element-wise
+// scaling with the vector p.
+class TESTFUNC : public LCG_Solver
+{
+public:
+	TESTFUNC();
+	~TESTFUNC();
+
+	// Compute the right-hand side term B of the conjugate gradient system from x
+	void cal_partb(lcg_float *B, const lcg_float *x);
+
+	// Define how the product A*x is evaluated: b = K^T * (K * a)
+	void AxProduct(const lcg_float* a, lcg_float* b, const int num)
+	{
+		// tmp_arr = K * a, then b = K^T * tmp_arr
+		lcg_matvec(kernel, a, tmp_arr, M, num, MatNormal);
+		lcg_matvec(kernel, tmp_arr, b, M, num, MatTranspose);
+	}
+
+	// Apply the preconditioner: b[i] = p[i] * a[i]
+	void MxProduct(const lcg_float* a, lcg_float* b, const int num)
+	{
+		for (size_t idx = 0; idx < num; idx++)
+			b[idx] = p[idx]*a[idx];
+	}
+
+private:
+	// Kernel matrix stored as a plain two-dimensional array
+	lcg_float **kernel;
+	// Array for intermediate results
+	lcg_float *tmp_arr;
+	// Preconditioning vector
+	lcg_float *p;
+};
+
+// Allocate the working arrays, fill the kernel with random values in
+// [-1, 1] and set each p[i] to the reciprocal of the squared norm of
+// kernel column i.
+TESTFUNC::TESTFUNC()
+{
+	kernel = lcg_malloc(M, N);
+	tmp_arr = lcg_malloc(M);
+	p = lcg_malloc(N);
+
+	lcg_vecrnd(kernel, -1.0, 1.0, M, N);
+	lcg_vecset(p, 1.0, N);
+
+	for (size_t col = 0; col < N; col++)
+	{
+		// Accumulate the squared entries of this column
+		lcg_float col_sq_sum = 0.0;
+		for (size_t row = 0; row < M; row++)
+		{
+			const lcg_float entry = kernel[row][col];
+			col_sq_sum += entry*entry;
+		}
+		p[col] = 1.0/col_sq_sum;
+	}
+}
+
+// Release the three arrays allocated in the constructor.
+TESTFUNC::~TESTFUNC()
+{
+	lcg_free(kernel, M);
+	lcg_free(tmp_arr);
+	lcg_free(p);
+}
+
+// Compute B = K^T * (K * x) for the full M x N kernel; tmp_arr holds the
+// intermediate product K * x.
+void TESTFUNC::cal_partb(lcg_float *B, const lcg_float *x)
+{
+	lcg_matvec(kernel, x, tmp_arr, M, N, MatNormal);
+	lcg_matvec(kernel, tmp_arr, B, M, N, MatTranspose);
+}
+
+// Build a synthetic problem with a known solution fm, then solve it with each
+// solver in turn and print the maximal difference to fm after every run.
+int main(int argc, char const *argv[])
+{
+	// Generate a forward model with random values in [1, 2]
+	double *fm = lcg_malloc(N);
+	lcg_vecrnd(fm, 1.0, 2.0, N);
+
+	TESTFUNC test;
+
+	// Compute the right-hand side term B of the conjugate gradient system
+	double *B = lcg_malloc(N);
+	test.cal_partb(B, fm);
+
+	/******************** preparation finished ************************/
+	lcg_para self_para = lcg_default_parameters();
+	self_para.epsilon = 1e-6;
+	self_para.abs_diff = 0;
+	test.set_lcg_parameter(self_para);
+
+	// Solution vector
+	lcg_float *m = lcg_malloc(N);
+
+	// Bounds of the solution used by the constrained solvers
+	lcg_float *low = lcg_malloc(N);
+	lcg_float *hig = lcg_malloc(N);
+	lcg_vecset(low, 1.0, N);
+	lcg_vecset(hig, 2.0, N);
+
+	// Every run starts from an all-zero solution and ends with the same report.
+	auto reset_model = [&]() { lcg_vecset(m, 0.0, N); };
+	auto report = [&]() {
+		std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+	};
+
+	reset_model();
+	test.Minimize(m, B, N, LCG_CG);
+	report();
+
+	reset_model();
+	test.MinimizePreconditioned(m, B, N);
+	report();
+
+	reset_model();
+	test.Minimize(m, B, N, LCG_CGS);
+	report();
+
+	reset_model();
+	test.Minimize(m, B, N, LCG_BICGSTAB);
+	report();
+
+	reset_model();
+	test.Minimize(m, B, N, LCG_BICGSTAB2);
+	report();
+
+	reset_model();
+	test.MinimizeConstrained(m, B, low, hig, N, LCG_PG);
+	report();
+
+	reset_model();
+	test.MinimizeConstrained(m, B, low, hig, N, LCG_SPG);
+	report();
+
+	lcg_free(fm);
+	lcg_free(B);
+	lcg_free(m);
+	lcg_free(low);
+	lcg_free(hig);
+	return 0;
+}
--- a/src/sample/sample3.cpp
+++ b/src/sample/sample3.cpp
@@ -0,0 +1,129 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "iostream"
+#include "../lib/clcg.h"
+
+// Size of the square N x N kernel matrix and of all vectors in this sample.
+#define N 100
+
+// Largest modulus of the element-wise difference between the complex vectors
+// a and b over the first `size` entries. Returns -1 when size is not positive.
+lcg_float max_diff(const lcg_complex *a, const lcg_complex *b, int size)
+{
+	lcg_float largest = -1;
+	int idx = 0;
+	while (idx < size)
+	{
+		lcg_complex delta;
+		delta = a[idx] - b[idx];
+		largest = lcg_max(clcg_module(&delta), largest);
+		++idx;
+	}
+	return largest;
+}
+
+// Kernel matrix stored as a plain two-dimensional array; allocated, filled
+// and freed in main() and read by CalAx().
+lcg_complex **kernel;
+
+// Compute the product of the kernel matrix and the vector x. layout and
+// conjugate are forwarded unchanged to clcg_matvec; instance is not used.
+void CalAx(void *instance, const lcg_complex *x, lcg_complex *prod_Ax, 
+	const int x_size, lcg_matrix_e layout, clcg_complex_e conjugate)
+{
+	clcg_matvec(kernel, x, prod_Ax, N, x_size, layout, conjugate);
+	return;
+}
+
+
+// Progress monitor of the conjugate gradient solver. Rewrites the current
+// terminal line (leading '\r') with the iteration count and the convergence
+// value on every call and always returns 0.
+int Prog(void* instance, const lcg_complex* m, const lcg_float converge, 
+	const clcg_para* param, const int n_size, const int k)
+{
+	std::clog << "\rIteration-times: " << k << "\tconvergence: " << converge;
+	return 0;
+}
+
+// Build a random symmetric complex system with a known solution fm, then
+// solve it with five different solvers and print the maximal difference to fm
+// after every run.
+int main(int argc, char const *argv[])
+{
+	srand(time(0));
+
+	kernel = clcg_malloc(N, N);
+	clcg_vecrnd(kernel, lcg_complex(-1.0, -1.0), lcg_complex(1.0, 1.0), N, N);
+
+	// Make the kernel matrix symmetric by mirroring the upper triangle
+	for (int row = 0; row < N; row++)
+	{
+		for (int col = row; col < N; col++)
+		{
+			kernel[col][row] = kernel[row][col];
+		}
+	}
+
+	// Generate a forward model
+	lcg_complex *fm = clcg_malloc(N);
+	clcg_vecrnd(fm, lcg_complex(1.0, 1.0), lcg_complex(2.0, 2.0), N);
+
+	// Compute the right-hand side term B of the conjugate gradient system
+	lcg_complex *B = clcg_malloc(N);
+	clcg_matvec(kernel, fm, B, N, N, MatNormal, NonConjugate);
+
+	/******************** preparation finished ************************/
+	clcg_para self_para = clcg_default_parameters();
+	self_para.abs_diff = 0;
+	self_para.epsilon = 1e-8;
+
+	// Solution vector
+	lcg_complex *m = clcg_malloc(N);
+
+	int ret;
+
+	// Every run starts from an all-zero solution and prints a title first.
+	auto begin_run = [&](const char *title) {
+		clcg_vecset(m, lcg_complex(0.0, 0.0), N);
+		std::clog << title << std::endl;
+	};
+	// Every run ends with the solver status and the difference to fm.
+	auto end_run = [&](int status) {
+		std::clog << std::endl; clcg_error_str(status);
+		std::clog << "maximal difference: " << max_diff(fm, m, N) << std::endl << std::endl;
+	};
+
+	begin_run("solver: bicg");
+	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_BICG);
+	end_run(ret);
+
+	begin_run("solver: bicg-symmetric");
+	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_BICG_SYM);
+	end_run(ret);
+
+	begin_run("solver: cgs");
+	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_CGS);
+	end_run(ret);
+
+	begin_run("solver: bicgstab");
+	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_BICGSTAB);
+	end_run(ret);
+
+	begin_run("solver: tfqmr");
+	ret = clcg_solver(CalAx, Prog, m, B, N, &self_para, NULL, CLCG_TFQMR);
+	end_run(ret);
+
+	clcg_free(kernel, N);
+	clcg_free(fm);
+	clcg_free(B);
+	clcg_free(m);
+	return 0;
+}
--- a/src/sample/sample4.cpp
+++ b/src/sample/sample4.cpp
@@ -0,0 +1,199 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "../lib/solver.h"
+#include "ctime"
+#include "random"
+#include "iostream"
+#include "fstream"
+#include "iomanip"
+#include "complex"
+
+/**
+ * Read a complex sparse linear system from a binary file.
+ *
+ * Layout consumed, in order: int N, int nz, then nz records of
+ * (int row, int col, std::complex<double> value), then N
+ * std::complex<double> entries of the right-hand side. Every complex value is
+ * read into a std::complex<double> first and then copied into lcg_complex
+ * through its real()/imag() setters.
+ *
+ * All four output arrays are allocated with new[]; the caller releases them
+ * with delete[].
+ *
+ * @param filePath   path of the binary input file
+ * @param pN         receives the number of unknowns
+ * @param pnz        receives the number of stored matrix entries
+ * @param cooVal     receives the nz matrix values
+ * @param cooRowIdx  receives the nz row indices
+ * @param cooColIdx  receives the nz column indices
+ * @param b          receives the N right-hand side entries
+ *
+ * @throws std::ios_base::failure if the file can not be opened, the header is
+ *         invalid, or the file ends before all data has been read. Arrays that
+ *         were already allocated at that point stay assigned to the outputs.
+ */
+void read(std::string filePath, int *pN, int *pnz, lcg_complex **cooVal,
+	int **cooRowIdx, int **cooColIdx, lcg_complex **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in) throw std::ios_base::failure("read: can not open file " + filePath);
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+
+	// The header values are used as allocation sizes, so validate them first.
+	if (!in || *pN < 0 || *pnz < 0)
+		throw std::ios_base::failure("read: invalid header in file " + filePath);
+
+	*cooVal = new lcg_complex[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new lcg_complex[*pN]{};
+
+	std::complex<double> std_c;
+	for (int i = 0; i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&std_c, sizeof(std_c));
+		(*cooVal)[i].real(std_c.real());
+		(*cooVal)[i].imag(std_c.imag());
+	}
+
+	for (int i = 0; i < *pN; i++)
+	{
+		in.read((char*)&std_c, sizeof(std_c));
+		(*b)[i].real(std_c.real());
+		(*b)[i].imag(std_c.imag());
+	}
+
+	// A short read sets failbit on the stream; report it instead of
+	// returning partially filled data.
+	if (!in) throw std::ios_base::failure("read: unexpected end of file " + filePath);
+	return;
+}
+
+/**
+ * Read the reference solution of the linear system from a binary file.
+ *
+ * Layout consumed: int N followed by N std::complex<double> entries. The
+ * output array is allocated with new[]; the caller releases it with delete[].
+ *
+ * @param filePath  path of the binary answer file
+ * @param pN        receives the number of entries
+ * @param x         receives the N solution entries
+ *
+ * @throws std::ios_base::failure if the file can not be opened, the header is
+ *         invalid, or the file ends before all data has been read.
+ */
+void readAnswer(std::string filePath, int *pN, lcg_complex **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in) throw std::ios_base::failure("readAnswer: can not open file " + filePath);
+
+	in.read((char*)pN, sizeof(int));
+
+	// The header value is used as an allocation size, so validate it first.
+	if (!in || *pN < 0)
+		throw std::ios_base::failure("readAnswer: invalid header in file " + filePath);
+
+	*x = new lcg_complex[*pN]{};
+
+	std::complex<double> std_c;
+	for (int i = 0; i < *pN; i++)
+	{
+		in.read((char*)&std_c, sizeof(std_c));
+		(*x)[i].real(std_c.real());
+		(*x)[i].imag(std_c.imag());
+	}
+
+	if (!in) throw std::ios_base::failure("readAnswer: unexpected end of file " + filePath);
+	return;
+}
+
+// Largest modulus of the element-wise difference between the complex vectors
+// a and b over the first `size` entries. Returns -1 when size is not positive.
+lcg_float max_diff(const lcg_complex *a, const lcg_complex *b, int size)
+{
+	lcg_float largest = -1;
+	int idx = 0;
+	while (idx < size)
+	{
+		lcg_complex delta;
+		delta = a[idx] - b[idx];
+		largest = lcg_max(clcg_module(&delta), largest);
+		++idx;
+	}
+	return largest;
+}
+
+// Test problem built on CLCG_Solver that stores the system matrix as a dense
+// n_size x n_size two-dimensional array.
+class TESTFUNC : public CLCG_Solver
+{
+public:
+	TESTFUNC(int n);
+	~TESTFUNC();
+
+	// Fill the dense matrix from nz_size (row, column, value) triplets
+	void set_kernel(int *row_id, int *col_id, lcg_complex *val, int nz_size);
+
+	// Define how the product A*x is evaluated in the conjugate gradient
+	// iteration; x_size is used for both matrix dimensions
+	void AxProduct(const lcg_complex *x, lcg_complex *prod_Ax, const int x_size, 
+		lcg_matrix_e layout, clcg_complex_e conjugate)
+	{
+		clcg_matvec(kernel, x, prod_Ax, x_size, x_size, layout, conjugate);
+		return;
+	}
+
+private:
+	// Kernel matrix stored as a plain two-dimensional array
+	lcg_complex **kernel;
+	// Number of rows and columns of the kernel matrix
+	int n_size;
+};
+
+// Remember the matrix size n and allocate the n x n kernel matrix.
+TESTFUNC::TESTFUNC(int n)
+{
+	n_size = n;
+	kernel = clcg_malloc(n_size, n_size);
+}
+
+// Release the kernel matrix allocated in the constructor.
+TESTFUNC::~TESTFUNC()
+{
+	clcg_free(kernel, n_size);
+}
+
+// Build the dense kernel matrix from nz_size (row, column, value) triplets:
+// every entry is first reset to zero, then each triplet is written to its
+// position. A later triplet overwrites an earlier one at the same position.
+void TESTFUNC::set_kernel(int *row_id, int *col_id, lcg_complex *val, int nz_size)
+{
+	// Start from an all-zero matrix
+	for (size_t row = 0; row < n_size; row++)
+	{
+		lcg_complex *row_ptr = kernel[row];
+		for (size_t col = 0; col < n_size; col++)
+		{
+			row_ptr[col] = lcg_complex(0.0, 0.0);
+		}
+	}
+
+	// Scatter the stored entries
+	for (size_t k = 0; k < nz_size; k++)
+	{
+		const int row = row_id[k];
+		const int col = col_id[k];
+		kernel[row][col] = val[k];
+	}
+}
+
+// Load the 1K complex test case, copy it into a dense kernel matrix and solve
+// it with four different solvers, printing the maximal difference to the
+// reference solution after every run.
+int main(int argc, char const *argv[])
+{
+	std::string inputPath = "data/case_1K_cA";
+	std::string answerPath = "data/case_1K_cB";
+
+	int N;
+	int nz;
+	lcg_complex *A;
+	int *rowIdxA;
+	int *colIdxA;
+	lcg_complex *b;
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+
+	lcg_complex *ans_x;
+	readAnswer(answerPath, &N, &ans_x);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+
+	TESTFUNC test(N);
+	test.set_kernel(rowIdxA, colIdxA, A, nz);
+
+	/******************** preparation finished ************************/
+	clcg_para self_para = clcg_default_parameters();
+	self_para.epsilon = 1e-8;
+	self_para.abs_diff = 0;
+
+	test.set_clcg_parameter(self_para);
+
+	// Solution vector
+	lcg_complex *m = clcg_malloc(N);
+
+	// Every run starts from an all-zero solution and ends with the same report.
+	auto reset_model = [&]() { clcg_vecset(m, lcg_complex(0.0, 0.0), N); };
+	auto report = [&]() {
+		std::clog << "maximal difference: " << max_diff(ans_x, m, N) << std::endl << std::endl;
+	};
+
+	reset_model();
+	test.Minimize(m, b, N, CLCG_BICG);
+	report();
+
+	reset_model();
+	test.Minimize(m, b, N, CLCG_BICG_SYM);
+	report();
+
+	reset_model();
+	test.Minimize(m, b, N, CLCG_CGS);
+	report();
+
+	reset_model();
+	test.Minimize(m, b, N, CLCG_TFQMR);
+	report();
+
+	clcg_free(m);
+
+	delete[] A;
+	delete[] rowIdxA;
+	delete[] colIdxA;
+	delete[] b;
+	delete[] ans_x;
+	return 0;
+}
--- a/src/sample/sample5.cpp
+++ b/src/sample/sample5.cpp
@@ -0,0 +1,155 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "../lib/lcg_eigen.h"
+#include "iostream"
+#include "Eigen/Dense"
+
+#define M 1000
+#define N 800
+
+// Largest absolute element-wise difference between a and b.
+// Entries 0 .. a.size()-1 of both vectors are compared, so b must hold at
+// least a.size() entries. Returns -1 when a is empty.
+lcg_float max_diff(const Eigen::VectorXd &a, const Eigen::VectorXd &b)
+{
+	lcg_float max = -1;
+	for (int i = 0; i < a.size(); i++)
+	{
+		// fabs yields |a[i] - b[i]| directly; squaring and taking the root
+		// would underflow to 0 or overflow to inf for very small or very
+		// large gaps and adds a needless rounding step.
+		max = lcg_max(fabs(a[i] - b[i]), max);
+	}
+	return max;
+}
+
+// Dense M-by-N kernel matrix filled with random values
+Eigen::MatrixXd kernel = Eigen::MatrixXd::Random(M, N);
+// Scratch vector of length M that holds the intermediate product kernel * x
+Eigen::VectorXd tmp_arr(M);
+// Length-N vector of ones; CalMx multiplies its input element-wise by it
+Eigen::VectorXd p = Eigen::VectorXd::Constant(N, 1.0);
+
+// Matrix-vector product callback handed to the solvers:
+// prod_Ax = kernel^T * (kernel * x). The inner product kernel * x is staged
+// in the global tmp_arr, so the two statements must stay in this order.
+// instance is not used.
+void CalAx(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Ax)
+{
+	tmp_arr = kernel * x;
+	prod_Ax = kernel.transpose() * tmp_arr;
+	return;
+}
+
+// Preconditioner callback: scales x element-wise by the global vector p and
+// stores the result in prod_Mx. instance is not used.
+void CalMx(void* instance, const Eigen::VectorXd &x, Eigen::VectorXd &prod_Mx)
+{
+	prod_Mx = (p.array() * x.array()).matrix();
+}
+
+// Progress callback for the conjugate gradient solvers: rewrites the current
+// terminal line with the iteration count k and the convergence value.
+// instance, m and param are not used. Always returns 0.
+int Prog(void* instance, const Eigen::VectorXd *m, const lcg_float converge, 
+	const lcg_para *param, const int k)
+{
+	std::clog << "\rIteration-times: " << k;
+	std::clog << "\tconvergence: " << converge;
+	return 0;
+}
+
+// Sample driver: builds a synthetic problem from the global random kernel and
+// solves it with each supported solver in turn, printing the solver status,
+// the maximal deviation from the reference model, and the elapsed CPU time.
+int main(int argc, char const *argv[])
+{
+	// Build the reference model fm from random values, scaled by Range and shifted by LO
+	lcg_float LO = 1.0, HI = 2.0, Range = HI - LO;
+	Eigen::VectorXd fm = Eigen::VectorXd::Random(N);
+	fm = (fm + Eigen::VectorXd::Constant(N, 1.0))*0.5*Range;
+	fm = (fm + Eigen::VectorXd::Constant(N, LO));
+
+	// Right-hand side B = kernel^T * (kernel * fm)
+	tmp_arr = kernel * fm;
+	Eigen::VectorXd B = kernel.transpose() * tmp_arr;
+
+	/******************** Preparation finished ************************/
+	lcg_para self_para = lcg_default_parameters();
+	self_para.epsilon = 1e-5;
+	self_para.abs_diff = 0;
+
+	// Solution vector plus lower/upper bounds for the constrained solvers
+	Eigen::VectorXd m = Eigen::VectorXd::Zero(N);
+	Eigen::VectorXd low = Eigen::VectorXd::Constant(N, LO);
+	Eigen::VectorXd hig = Eigen::VectorXd::Constant(N, HI);
+
+	// Prints the outcome of one solver run: status text, deviation from fm, elapsed time.
+	auto report = [&](int status, clock_t t0, clock_t t1)
+	{
+		std::clog << std::endl; lcg_error_str(status);
+		std::clog << "maximal difference: " << max_diff(fm, m) << std::endl;
+		std::clog << "time use: "<<1000*(t1-t0)/(double)CLOCKS_PER_SEC<<" ms" << std::endl;
+	};
+
+	std::clog << "solver: cg" << std::endl;
+	clock_t start = clock();
+	int ret = lcg_solver_eigen(CalAx, Prog, m, B, &self_para, NULL, LCG_CG);
+	report(ret, start, clock());
+
+	m.setZero();
+	std::clog << "solver: pcg" << std::endl;
+	start = clock();
+	ret = lcg_solver_preconditioned_eigen(CalAx, CalMx, Prog, m, B, &self_para, NULL, LCG_PCG);
+	report(ret, start, clock());
+
+	m.setZero();
+	std::clog << "solver: cgs" << std::endl;
+	start = clock();
+	ret = lcg_solver_eigen(CalAx, Prog, m, B, &self_para, NULL, LCG_CGS);
+	report(ret, start, clock());
+
+	m.setZero();
+	std::clog << "solver: bicgstab" << std::endl;
+	start = clock();
+	ret = lcg_solver_eigen(CalAx, Prog, m, B, &self_para, NULL, LCG_BICGSTAB);
+	report(ret, start, clock());
+
+	m.setZero();
+	std::clog << "solver: bicgstab2" << std::endl;
+	start = clock();
+	ret = lcg_solver_eigen(CalAx, Prog, m, B, &self_para, NULL, LCG_BICGSTAB2);
+	report(ret, start, clock());
+
+	m.setZero();
+	std::clog << "solver: pg" << std::endl;
+	start = clock();
+	ret = lcg_solver_constrained_eigen(CalAx, Prog, m, B, low, hig, &self_para, NULL, LCG_PG);
+	report(ret, start, clock());
+
+	m.setZero();
+	std::clog << "solver: spg" << std::endl;
+	start = clock();
+	ret = lcg_solver_constrained_eigen(CalAx, Prog, m, B, low, hig, &self_para, NULL, LCG_SPG);
+	report(ret, start, clock());
+
+	return 0;
+}
--- a/src/sample/sample6.cpp
+++ b/src/sample/sample6.cpp
@@ -0,0 +1,235 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "iostream"
+#include "fstream"
+#include "complex"
+#include "../lib/lcg_complex.h"
+#include "../lib/solver_eigen.h"
+#include "Eigen/Sparse"
+
+typedef Eigen::SparseMatrix<lcg_complex, Eigen::RowMajor> spmat_cd; // Note: Eigen's default sparse storage order is column-major, so row-major is requested explicitly
+// Coordinate-format entry (row, column, value) used to fill spmat_cd
+typedef Eigen::Triplet<lcg_complex> triplt_cd;
+
+// Load a sparse complex linear system from the binary file filePath.
+//
+// Data consumed, in order: int N, int nz, then nz records of
+// (int row, int col, std::complex<double> value), then N std::complex<double>
+// right-hand-side values.
+//
+// Outputs: *pN and *pnz receive the sizes; *cooVal, *cooRowIdx and *cooColIdx
+// (nz entries each) and *b (N entries) receive arrays allocated with new[],
+// which the caller releases with delete[].
+//
+// Throws std::ios_base::failure when the file cannot be opened, the header is
+// short or holds a negative size, or the file ends early. In the last case the
+// arrays are freed and the output pointers reset to null before throwing.
+void read(std::string filePath, int *pN, int *pnz, lcg_complex **cooVal, 
+	int **cooRowIdx, int **cooColIdx, lcg_complex **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in)
+	{
+		throw std::ios_base::failure("read: cannot open file " + filePath);
+	}
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+	// The sizes drive the allocations below, so reject a bad header first.
+	if (!in || *pN < 0 || *pnz < 0)
+	{
+		throw std::ios_base::failure("read: invalid header in file " + filePath);
+	}
+
+	*cooVal = new lcg_complex[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new lcg_complex[*pN]{};
+
+	// Values are stored as std::complex<double> and copied part by part into lcg_complex.
+	std::complex<double> std_c;
+	for (int i = 0; in && i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&std_c, sizeof(std_c));
+		(*cooVal)[i].real(std_c.real());
+		(*cooVal)[i].imag(std_c.imag());
+	}
+
+	for (int i = 0; in && i < *pN; i++)
+	{
+		in.read((char*)&std_c, sizeof(std_c));
+		(*b)[i].real(std_c.real());
+		(*b)[i].imag(std_c.imag());
+	}
+
+	// A failed stream here means the file held less data than the header announced.
+	if (!in)
+	{
+		delete[] *cooVal; *cooVal = nullptr;
+		delete[] *cooRowIdx; *cooRowIdx = nullptr;
+		delete[] *cooColIdx; *cooColIdx = nullptr;
+		delete[] *b; *b = nullptr;
+		throw std::ios_base::failure("read: unexpected end of file " + filePath);
+	}
+    return;
+}
+
+// Load a reference solution from the binary file filePath.
+//
+// Data consumed, in order: int N, then N std::complex<double> values.
+// *pN receives N and *x an array of N entries allocated with new[], which the
+// caller releases with delete[].
+//
+// Throws std::ios_base::failure when the file cannot be opened, the header is
+// short or negative, or the file ends early (the array is then freed and *x
+// reset to null before throwing).
+void readAnswer(std::string filePath, int *pN, lcg_complex **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in)
+	{
+		throw std::ios_base::failure("readAnswer: cannot open file " + filePath);
+	}
+
+	in.read((char*)pN, sizeof(int));
+	// N is used as an allocation size, so validate it before allocating.
+	if (!in || *pN < 0)
+	{
+		throw std::ios_base::failure("readAnswer: invalid header in file " + filePath);
+	}
+
+	*x = new lcg_complex[*pN]{};
+
+	std::complex<double> std_c;
+	for (int i = 0; in && i < *pN; i++)
+	{
+		in.read((char*)&std_c, sizeof(std_c));
+		(*x)[i].real(std_c.real());
+		(*x)[i].imag(std_c.imag());
+	}
+
+	if (!in)
+	{
+		delete[] *x; *x = nullptr;
+		throw std::ios_base::failure("readAnswer: unexpected end of file " + filePath);
+	}
+    return;
+}
+
+// Largest modulus |a[i] - b[i]| over the entries 0 .. a.size()-1.
+// b must hold at least a.size() entries. Returns -1 when a is empty.
+lcg_float max_diff(const Eigen::VectorXcd &a, const Eigen::VectorXcd &b)
+{
+	lcg_float max = -1;
+	std::complex<lcg_float> t;
+	for (int i = 0; i < a.size(); i++)
+	{
+		t = a[i] - b[i];
+		// Use the modulus itself rather than its square, so the value printed
+		// as "maximal difference" is an actual distance.
+		max = lcg_max(std::abs(t), max);
+	}
+	return max;
+}
+
+// Test problem derived from CLCG_EIGEN_Solver: stores a sparse complex kernel
+// and a vector P, and supplies the AxProduct / MxProduct operations.
+class TESTFUNC : public CLCG_EIGEN_Solver
+{
+public:
+	TESTFUNC(int n);
+	~TESTFUNC();
+
+	void set_kernel(int *row_id, int *col_id, lcg_complex *val, int nz_size);
+	void set_p();
+
+	// Ax product of the conjugate gradient method: prod_Ax = kernel * x, or
+	// conj(kernel) * x when conjugate == Conjugate. layout is not used.
+	void AxProduct(const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Ax, 
+		lcg_matrix_e layout, clcg_complex_e conjugate)
+	{
+		if (conjugate != Conjugate)
+		{
+			prod_Ax = kernel * x;
+			return;
+		}
+		prod_Ax = kernel.conjugate() * x;
+	}
+
+	// Mx product: prod_Mx is the element-wise product of P and x.
+	// layout and conjugate are not used.
+	void MxProduct(const Eigen::VectorXcd &x, Eigen::VectorXcd &prod_Mx, 
+		lcg_matrix_e layout, clcg_complex_e conjugate)
+	{
+		prod_Mx = P.cwiseProduct(x);
+	}
+
+private:
+	spmat_cd kernel;      // sparse system matrix, n_size x n_size
+	Eigen::VectorXcd P;   // element-wise factors applied by MxProduct
+	int n_size;           // problem dimension given to the constructor
+};
+
+// Constructor: sizes the kernel as an empty n x n sparse matrix and P as a
+// vector of length n (its entries are assigned later by set_p).
+TESTFUNC::TESTFUNC(int n)
+	: kernel(n, n), P(n), n_size(n)
+{
+}
+
+// Destructor: shrinks the kernel to an empty 0 x 0 matrix.
+TESTFUNC::~TESTFUNC()
+{
+	kernel.resize(0, 0);
+}
+
+// Fill the sparse kernel from coordinate-format input: entry i contributes
+// val[i] at (row_id[i], col_id[i]), for i in [0, nz_size).
+// A non-positive nz_size leaves the triplet list empty.
+void TESTFUNC::set_kernel(int *row_id, int *col_id, lcg_complex *val, int nz_size)
+{
+	std::vector<triplt_cd> val_triplt;
+	// Allocate once instead of growing the vector entry by entry.
+	if (nz_size > 0) val_triplt.reserve(nz_size);
+
+	// Signed index: a negative nz_size gives zero iterations instead of being
+	// converted to a huge unsigned bound.
+	for (int i = 0; i < nz_size; i++)
+	{
+		val_triplt.push_back(triplt_cd(row_id[i], col_id[i], val[i]));
+	}
+
+	kernel.setFromTriplets(val_triplt.begin(), val_triplt.end());
+	return;
+}
+
+// Set P to the reciprocals of the kernel's diagonal entries: P[d] = 1 / kernel(d, d).
+void TESTFUNC::set_p()
+{
+	size_t d = 0;
+	while (d < n_size)
+	{
+		P[d] = 1.0/kernel.coeff(d, d);
+		++d;
+	}
+}
+
+// Sample driver: loads a sparse complex system and its reference solution,
+// then runs four plain and two preconditioned solvers, printing the value of
+// max_diff against the reference after each run.
+int main(int argc, char const *argv[])
+{
+	std::string inputPath = "data/case_10K_cA";
+	std::string answerPath = "data/case_10K_cB";
+
+	// Sizes and raw buffers filled in by read() and readAnswer().
+	int N, nz;
+	int *rowIdxA, *colIdxA;
+	lcg_complex *A, *b, *ans_x;
+
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+	readAnswer(answerPath, &N, &ans_x);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+
+	TESTFUNC test(N);
+	test.set_kernel(rowIdxA, colIdxA, A, nz);
+	test.set_p();
+
+	// Copy the right-hand side and the reference solution into Eigen vectors.
+	Eigen::VectorXcd B(N), ANS(N);
+	for (size_t i = 0; i < N; ++i)
+	{
+		ANS[i] = ans_x[i];
+		B[i] = b[i];
+	}
+
+	/******************** Preparation finished ************************/
+	clcg_para self_para = clcg_default_parameters();
+	self_para.epsilon = 1e-16;
+	self_para.abs_diff = 0;
+	test.set_clcg_parameter(self_para);
+	test.set_report_interval(10);
+
+	// Solution vector, starting from zero.
+	Eigen::VectorXcd m = Eigen::VectorXcd::Zero(N);
+
+	// Prints the deviation of the current m from the reference solution.
+	auto report = [&]()
+	{
+		std::clog << "maximal difference: " << max_diff(ANS, m) << std::endl << std::endl;
+	};
+
+	test.Minimize(m, B, CLCG_BICG);
+	report();
+
+	m.setZero();
+	test.Minimize(m, B, CLCG_BICG_SYM);
+	report();
+
+	m.setZero();
+	test.Minimize(m, B, CLCG_CGS);
+	report();
+
+	m.setZero();
+	test.Minimize(m, B, CLCG_TFQMR);
+	report();
+
+	m.setZero();
+	test.MinimizePreconditioned(m, B, CLCG_PCG);
+	report();
+
+	m.setZero();
+	test.MinimizePreconditioned(m, B, CLCG_PBICG);
+	report();
+
+	B.resize(0);
+	ANS.resize(0);
+	m.resize(0);
+
+	// Release the host arrays allocated by read()/readAnswer().
+	delete[] A;
+	delete[] rowIdxA;
+	delete[] colIdxA;
+	delete[] b;
+	delete[] ans_x;
+	return 0;
+}
--- a/src/sample/sample7.cpp
+++ b/src/sample/sample7.cpp
@@ -0,0 +1,233 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include "iostream"
+#include "fstream"
+#include "../lib/solver_eigen.h"
+#include "../lib/preconditioner_eigen.h"
+
+// Shorthand names used throughout this sample:
+// complex scalar, row-major sparse matrix, coordinate-format entry, dense vector.
+typedef std::complex<double> complex_d;
+typedef Eigen::SparseMatrix<std::complex<double>, Eigen::RowMajor> spmat_cd;
+typedef Eigen::Triplet<complex_d> triplt_cd;
+typedef Eigen::VectorXcd vector_cd;
+
+// Load a sparse complex linear system from the binary file filePath.
+//
+// Data consumed, in order: int N, int nz, then nz records of
+// (int row, int col, complex_d value), then N complex_d right-hand-side values.
+//
+// Outputs: *pN and *pnz receive the sizes; *cooVal, *cooRowIdx and *cooColIdx
+// (nz entries each) and *b (N entries) receive arrays allocated with new[],
+// which the caller releases with delete[].
+//
+// Throws std::ios_base::failure when the file cannot be opened, the header is
+// short or holds a negative size, or the file ends early. In the last case the
+// arrays are freed and the output pointers reset to null before throwing.
+void read(std::string filePath, int *pN, int *pnz, complex_d **cooVal, 
+	int **cooRowIdx, int **cooColIdx, complex_d **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in)
+	{
+		throw std::ios_base::failure("read: cannot open file " + filePath);
+	}
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+	// The sizes drive the allocations below, so reject a bad header first.
+	if (!in || *pN < 0 || *pnz < 0)
+	{
+		throw std::ios_base::failure("read: invalid header in file " + filePath);
+	}
+
+	*cooVal = new complex_d[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new complex_d[*pN]{};
+
+	for (int i = 0; in && i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&(*cooVal)[i], sizeof(complex_d));
+	}
+
+	// The right-hand side is stored contiguously and read in one call.
+	in.read((char*)(*b), sizeof(complex_d)*(*pN));
+
+	// A failed stream here means the file held less data than the header announced.
+	if (!in)
+	{
+		delete[] *cooVal; *cooVal = nullptr;
+		delete[] *cooRowIdx; *cooRowIdx = nullptr;
+		delete[] *cooColIdx; *cooColIdx = nullptr;
+		delete[] *b; *b = nullptr;
+		throw std::ios_base::failure("read: unexpected end of file " + filePath);
+	}
+    return;
+}
+
+// Load a reference solution from the binary file filePath.
+//
+// Data consumed, in order: int N, then N complex_d values.
+// *pN receives N and *x an array of N entries allocated with new[], which the
+// caller releases with delete[].
+//
+// Throws std::ios_base::failure when the file cannot be opened, the header is
+// short or negative, or the file ends early (the array is then freed and *x
+// reset to null before throwing).
+void readAnswer(std::string filePath, int *pN, complex_d **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in)
+	{
+		throw std::ios_base::failure("readAnswer: cannot open file " + filePath);
+	}
+
+	in.read((char*)pN, sizeof(int));
+	// N is used as an allocation size, so validate it before allocating.
+	if (!in || *pN < 0)
+	{
+		throw std::ios_base::failure("readAnswer: invalid header in file " + filePath);
+	}
+
+	*x = new complex_d[*pN]{};
+
+	in.read((char*)(*x), sizeof(complex_d)*(*pN));
+	if (!in)
+	{
+		delete[] *x; *x = nullptr;
+		throw std::ios_base::failure("readAnswer: unexpected end of file " + filePath);
+	}
+    return;
+}
+
+// Largest modulus |a[k] - b[k]| over the entries 0 .. a.size()-1.
+// Returns -1 when a is empty.
+double max_diff(const vector_cd &a, const vector_cd &b)
+{
+	double largest = -1;
+	for (int k = 0; k < a.size(); ++k)
+	{
+		const complex_d gap = a[k] - b[k];
+		// std::norm is the squared modulus, so its root is the modulus
+		const double modulus = std::sqrt(std::norm(gap));
+		largest = lcg_max(modulus, largest);
+	}
+	return largest;
+}
+
+// Test problem derived from CLCG_EIGEN_Solver: stores the sparse kernel, two
+// triangular matrices used for preconditioning and a work vector, and supplies
+// the AxProduct / MxProduct operations.
+class TESTFUNC : public CLCG_EIGEN_Solver
+{
+public:
+	TESTFUNC(int n);
+	~TESTFUNC();
+
+	void set_kernel(int *row_id, int *col_id, complex_d *val, int nz_size);
+	void set_preconditioner();
+
+	// Ax product of the conjugate gradient method: prod_Ax = kernel * x, or
+	// conj(kernel) * x when conjugate == Conjugate. layout is not used.
+	void AxProduct(const vector_cd &x, vector_cd &prod_Ax, lcg_matrix_e layout, clcg_complex_e conjugate)
+	{
+		if (conjugate != Conjugate)
+		{
+			prod_Ax = kernel * x;
+			return;
+		}
+		prod_Ax = kernel.conjugate() * x;
+	}
+
+	// Mx product (preconditioner application). layout and conjugate are not used.
+	void MxProduct(const vector_cd &x, vector_cd &prod_Mx, lcg_matrix_e layout, clcg_complex_e conjugate)
+	{
+		// Disabled alternative: no preconditioning
+		//prod_Mx = x;
+
+		// Disabled alternative: preconditioning with the diagonal kernel
+		//prod_Mx = p.cwiseProduct(x);
+
+		// Active: preconditioning with the ILUT/IC factors. The lower-triangle
+		// call is handed l_tri, x and p; the upper-triangle call is then
+		// handed u_tri, p and prod_Mx.
+		clcg_solve_lower_triangle(l_tri, x, p);
+		clcg_solve_upper_triangle(u_tri, p, prod_Mx);
+	}
+
+private:
+	// Sparse kernel matrix and the lower/upper triangular matrices of the preconditioner
+	spmat_cd kernel, l_tri, u_tri;
+	// Work vector shared by the two triangular calls in MxProduct
+	vector_cd p;
+	// Problem dimension given to the constructor
+	int n_size;
+};
+
+// Constructor: sizes the kernel as an empty n x n sparse matrix and the work
+// vector p to length n; l_tri and u_tri stay default-constructed.
+TESTFUNC::TESTFUNC(int n)
+	: kernel(n, n), p(n), n_size(n)
+{
+}
+
+// Destructor: shrinks the three sparse matrices and the work vector to size zero.
+TESTFUNC::~TESTFUNC()
+{
+	kernel.resize(0, 0);
+	l_tri.resize(0, 0);
+	u_tri.resize(0, 0);
+	p.resize(0);
+}
+
+// Fill the sparse kernel from coordinate-format input: entry i contributes
+// val[i] at (row_id[i], col_id[i]), for i in [0, nz_size).
+// A non-positive nz_size leaves the triplet list empty.
+void TESTFUNC::set_kernel(int *row_id, int *col_id, complex_d *val, int nz_size)
+{
+	std::vector<triplt_cd> val_triplt;
+	// Allocate once instead of growing the vector entry by entry.
+	if (nz_size > 0) val_triplt.reserve(nz_size);
+
+	// Signed index: a negative nz_size gives zero iterations instead of being
+	// converted to a huge unsigned bound.
+	for (int i = 0; i < nz_size; i++)
+	{
+		val_triplt.push_back(triplt_cd(row_id[i], col_id[i], val[i]));
+	}
+
+	kernel.setFromTriplets(val_triplt.begin(), val_triplt.end());
+	return;
+}
+
+// Prepare the data used by MxProduct. Only option 3 is active: it calls
+// clcg_incomplete_Cholesky(kernel, l_tri) and stores the transpose of l_tri
+// in u_tri. The other options are kept as commented-out alternatives.
+void TESTFUNC::set_preconditioner()
+{
+	// 1. Preconditioning using the reciprocal of the kernel diagonal (disabled)
+	/*
+	for (size_t i = 0; i < n_size; i++)
+	{
+		p[i] = 1.0/kernel.coeff(i, i);
+	}
+	*/
+
+	// 2. Preconditioning using the incomplete LU decomposition (disabled)
+	//incomplete_LU(kernel, l_tri, u_tri);
+
+	// 3. Preconditioning using the incomplete Cholesky decomposition (active)
+	// NOTE(review): presumably l_tri receives the lower-triangular factor — confirm against the library.
+	clcg_incomplete_Cholesky(kernel, l_tri);
+	u_tri = l_tri.transpose();
+
+	// 4. Preconditioning using compressed incomplete decompositions (disabled)
+	/*
+	vector_cd one = Eigen::VectorXcd::Ones(n_size);
+	vector_cd x = Eigen::VectorXcd::Zero(n_size);
+
+	solve_lower_triangle(l_tri, one, x);
+	solve_upper_triangle(u_tri, x, p);
+	*/
+	return;
+}
+
+// Sample driver: loads a sparse complex system and its reference solution,
+// sets up the preconditioner, and runs the two preconditioned solvers,
+// printing the maximal deviation from the reference after each run.
+// Returns 0 on success and 1 when a std::exception reaches the handler.
+int main(int argc, char const *argv[]) try
+{
+	std::string inputPath = "data/case_1K_cA";
+	std::string answerPath = "data/case_1K_cB";
+
+	int N;
+	int nz;
+	complex_d *A;
+	int *rowIdxA;
+	int *colIdxA;
+	complex_d *b;
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+
+	complex_d *ans_x;
+	readAnswer(answerPath, &N, &ans_x);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+
+	TESTFUNC test(N);
+	test.set_kernel(rowIdxA, colIdxA, A, nz);
+	test.set_preconditioner();
+
+	// Copy the right-hand side and the reference solution into Eigen vectors.
+	vector_cd B, ANS;
+	B.resize(N);
+	ANS.resize(N);
+	for (int i = 0; i < N; i++)
+	{
+		B[i] = b[i];
+		ANS[i] = ans_x[i];
+	}
+
+	/******************** Preparation finished ************************/
+
+	clcg_para self_para = clcg_default_parameters();
+	self_para.epsilon = 1e-12;
+	self_para.abs_diff = 0;
+	test.set_clcg_parameter(self_para);
+	test.set_report_interval(10);
+
+	// Solution vector, starting from zero.
+	Eigen::VectorXcd m = Eigen::VectorXcd::Constant(N, std::complex<double>(0.0, 0.0));
+
+	test.MinimizePreconditioned(m, B, CLCG_PCG);
+	std::clog << "maximal difference: " << max_diff(ANS, m) << std::endl << std::endl;
+
+	m.setZero();
+	test.MinimizePreconditioned(m, B, CLCG_PBICG);
+	std::clog << "maximal difference: " << max_diff(ANS, m) << std::endl << std::endl;
+
+	ANS.resize(0);
+	B.resize(0);
+	m.resize(0);
+
+	// Release the host arrays allocated with new[] by read()/readAnswer().
+	delete[] A;
+	delete[] rowIdxA;
+	delete[] colIdxA;
+	delete[] b;
+	delete[] ans_x;
+
+	return 0;
+}
+catch (std::exception &e)
+{
+	std::cerr << e.what() << std::endl;
+	// Report the failure through the exit status instead of falling off the
+	// end of the handler, which would make main return 0.
+	return 1;
+}
--- a/src/sample/sample8.cu
+++ b/src/sample/sample8.cu
@@ -0,0 +1,312 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+
+#include "../lib/lcg_cuda.h"
+
+// Load a sparse real linear system from the binary file filePath.
+//
+// Data consumed, in order: int N, int nz, then nz records of
+// (int row, int col, double value), then N double right-hand-side values.
+//
+// Outputs: *pN and *pnz receive the sizes; *cooVal, *cooRowIdx and *cooColIdx
+// (nz entries each) and *b (N entries) receive host arrays allocated with
+// new[], which the caller releases with delete[].
+//
+// Throws std::ios_base::failure when the file cannot be opened, the header is
+// short or holds a negative size, or the file ends early. In the last case the
+// arrays are freed and the output pointers reset to null before throwing.
+void read(std::string filePath, int *pN, int *pnz, double **cooVal,
+	int **cooRowIdx, int **cooColIdx, double **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in)
+	{
+		throw std::ios_base::failure("read: cannot open file " + filePath);
+	}
+
+	in.read((char*)pN, sizeof(int));
+	in.read((char*)pnz, sizeof(int));
+	// The sizes drive the allocations below, so reject a bad header first.
+	if (!in || *pN < 0 || *pnz < 0)
+	{
+		throw std::ios_base::failure("read: invalid header in file " + filePath);
+	}
+
+	*cooVal = new double[*pnz]{};
+	*cooRowIdx = new int[*pnz]{};
+	*cooColIdx = new int[*pnz]{};
+	*b = new double[*pN]{};
+
+	for (int i = 0; in && i < *pnz; ++i)
+	{
+		in.read((char*)&(*cooRowIdx)[i], sizeof(int));
+		in.read((char*)&(*cooColIdx)[i], sizeof(int));
+		in.read((char*)&(*cooVal)[i], sizeof(double));
+	}
+
+	// The right-hand side is stored contiguously and read in one call.
+	in.read((char*)(*b), sizeof(double)*(*pN));
+
+	// A failed stream here means the file held less data than the header announced.
+	if (!in)
+	{
+		delete[] *cooVal; *cooVal = nullptr;
+		delete[] *cooRowIdx; *cooRowIdx = nullptr;
+		delete[] *cooColIdx; *cooColIdx = nullptr;
+		delete[] *b; *b = nullptr;
+		throw std::ios_base::failure("read: unexpected end of file " + filePath);
+	}
+    return;
+}
+
+// Load a reference solution from the binary file filePath.
+//
+// Data consumed, in order: int N, then N double values.
+// *pN receives N and *x a host array of N entries allocated with new[], which
+// the caller releases with delete[].
+//
+// Throws std::ios_base::failure when the file cannot be opened, the header is
+// short or negative, or the file ends early (the array is then freed and *x
+// reset to null before throwing).
+void readAnswer(std::string filePath, int *pN, double **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in)
+	{
+		throw std::ios_base::failure("readAnswer: cannot open file " + filePath);
+	}
+
+	in.read((char*)pN, sizeof(int));
+	// N is used as an allocation size, so validate it before allocating.
+	if (!in || *pN < 0)
+	{
+		throw std::ios_base::failure("readAnswer: invalid header in file " + filePath);
+	}
+
+	*x = new double[*pN]{};
+
+	in.read((char*)(*x), sizeof(double)*(*pN));
+	if (!in)
+	{
+		delete[] *x; *x = nullptr;
+		throw std::ios_base::failure("readAnswer: unexpected end of file " + filePath);
+	}
+    return;
+}
+
+// Error measure between two host arrays of length n: the square root of the
+// summed squared differences, divided by n.
+lcg_float avg_error(lcg_float *a, lcg_float *b, int n)
+{
+	lcg_float sq_sum = 0.0;
+	size_t k = 0;
+	while (k < n)
+	{
+		const lcg_float gap = a[k] - b[k];
+		sq_sum += gap*gap;
+		++k;
+	}
+	return sqrt(sq_sum)/n;
+}
+
+// Global state shared between main() and the solver callbacks.
+// Scalar factors passed by address to the cuSPARSE routines below.
+lcg_float one = 1.0;
+lcg_float zero = 0.0;
+
+// Device work buffer shared by SpMV, the incomplete-Cholesky routines and the triangular solves
+void *d_buf;
+// Sparse-matrix descriptor of A (created in main with cusparseCreateCsr)
+cusparseSpMatDescr_t smat_A;
+
+int *d_rowIdxA; // COO
+int *d_rowPtrA; // CSR
+int *d_colIdxA; // column indices, passed to both the COO->CSR conversion result and the CSR routines
+double *d_A;    // matrix values
+double *d_pd;   // intermediate vector between the two triangular solves in cudaMx
+double *d_ic;   // copy of the matrix values that the incomplete-Cholesky routine works on
+
+// Legacy cuSPARSE matrix descriptors and info objects for the factorization and the triangular solves
+cusparseMatDescr_t descr_A = 0;
+cusparseMatDescr_t descr_L = 0;
+csric02Info_t icinfo_A = 0;
+csrsv2Info_t info_L = 0;
+csrsv2Info_t info_LT = 0;
+
+// Matrix-vector product callback for the CUDA solvers.
+// Calls cusparseSpMV with the global descriptor smat_A, the scalars one/zero
+// and the work buffer d_buf, writing the product of A and x into prod_Ax.
+// instance, cub_handle, n_size and nz_size are not used.
+// The callback returns void, so a cuSPARSE failure is reported on std::cerr
+// rather than being dropped silently.
+void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
+{
+	// Calculate the product of A*x
+	const cusparseStatus_t status = cusparseSpMV(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		x, &zero, prod_Ax, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, d_buf);
+	if (status != CUSPARSE_STATUS_SUCCESS)
+	{
+		std::cerr << "cudaAx: cusparseSpMV failed with status " << static_cast<int>(status) << std::endl;
+	}
+    return;
+}
+
+// Preconditioner callback for the CUDA solvers.
+// Fetches the device pointers behind x and prod_Ax, then runs two
+// cusparseDcsrsv2_solve calls on the factor stored in d_ic: a non-transposed
+// solve from x into the global d_pd, followed by a transposed solve from d_pd
+// into prod_Ax. instance and cub_handle are not used.
+// Each step runs only if the previous one succeeded, so a failed pointer
+// lookup never feeds a null pointer into a solve; the first failing status is
+// reported on std::cerr because the callback returns void.
+void cudaMx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size)
+{
+	void *d_x = nullptr, *d_Ax = nullptr;
+	cusparseStatus_t status = cusparseDnVecGetValues(x, &d_x);
+	if (status == CUSPARSE_STATUS_SUCCESS)
+	{
+		status = cusparseDnVecGetValues(prod_Ax, &d_Ax);
+	}
+
+	// Non-transposed triangular solve: x -> d_pd
+	if (status == CUSPARSE_STATUS_SUCCESS)
+	{
+		status = cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
+			n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, (double*) d_x, (double*) d_pd, 
+			CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	}
+
+	// Transposed triangular solve: d_pd -> prod_Ax
+	if (status == CUSPARSE_STATUS_SUCCESS)
+	{
+		status = cusparseDcsrsv2_solve(cus_handle, CUSPARSE_OPERATION_TRANSPOSE, 
+			n_size, nz_size, &one, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, (double*) d_pd, (double*) d_Ax, 
+			CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	}
+
+	if (status != CUSPARSE_STATUS_SUCCESS)
+	{
+		std::cerr << "cudaMx: cuSPARSE call failed with status " << static_cast<int>(status) << std::endl;
+	}
+    return;
+}
+
+// Progress callback for the CUDA solvers: stays silent until converge has
+// dropped to param->epsilon or below, then prints the iteration count and the
+// convergence value. instance, m, n_size and nz_size are not used.
+// Always returns 0.
+int cudaProgress(void* instance, const lcg_float* m, const lcg_float converge, 
+	const lcg_para* param, const int n_size, const int nz_size, const int k)
+{
+	if (converge > param->epsilon) return 0;
+
+	std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
+	return 0;
+}
+
+// Sample driver: loads a real sparse system and its reference solution, copies
+// the matrix to the GPU, prepares an incomplete-Cholesky preconditioner with
+// cuSPARSE, and solves the system with CG, CGS and preconditioned CG.
+// NOTE(review): the status codes returned by the CUDA/cuBLAS/cuSPARSE calls
+// in this function are not inspected.
+int main(int argc, char **argv)
+{
+	std::string inputPath = "data/case_10K_A";
+	std::string answerPath = "data/case_10K_B";
+
+	// Host-side system (COO entries plus right-hand side) filled in by read()
+	int N;
+	int nz;
+	double *A;
+	int *rowIdxA;
+	int *colIdxA;
+	double *b;
+	read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+
+	// Reference solution; note that readAnswer() writes N again
+	double *ans_x;
+	readAnswer(answerPath, &N, &ans_x);
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+	
+	// Create handles
+	cublasHandle_t cubHandle;
+	cusparseHandle_t cusHandle;
+
+	cublasCreate(&cubHandle);
+	cusparseCreate(&cusHandle);
+
+	// Allocate GPU memory & copy matrix/vector to device
+	cudaMalloc(&d_A, nz * sizeof(double));
+	cudaMalloc(&d_rowIdxA, nz * sizeof(int));
+	cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int));
+	cudaMalloc(&d_colIdxA, nz * sizeof(int));
+	cudaMalloc(&d_pd, N * sizeof(double));
+
+	cudaMemcpy(d_A, A, nz * sizeof(double), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice);
+
+	// Convert matrix A from COO format to CSR format
+	// NOTE(review): presumably this expects the COO entries to be sorted by row — confirm against the input files.
+	cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create sparse matrix
+	cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+
+	// This is just used to get bufferSize;
+	cusparseDnVecDescr_t dvec_tmp;
+	cusparseCreateDnVec(&dvec_tmp, N, d_pd, CUDA_R_64F);
+
+	size_t bufferSize_B;
+	cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		dvec_tmp, &zero, dvec_tmp, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize_B);
+
+	// --- Start of the preconditioning part ---
+
+	// Copy A
+	cudaMalloc(&d_ic, nz * sizeof(lcg_float));
+	cudaMemcpy(d_ic, d_A, nz * sizeof(lcg_float), cudaMemcpyDeviceToDevice);
+
+	// The size_t SpMV buffer size is narrowed to int here so it can be compared with the other sizes
+	int bufferSize, bufferSize_A, bufferSize_L, bufferSize_LT;
+	bufferSize = bufferSize_B;
+
+	// create descriptor for matrix A
+	cusparseCreateMatDescr(&descr_A);
+
+	// initialize properties of matrix A
+	cusparseSetMatType(descr_A, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatFillMode(descr_A, CUSPARSE_FILL_MODE_LOWER);
+	cusparseSetMatDiagType(descr_A, CUSPARSE_DIAG_TYPE_NON_UNIT);
+	cusparseSetMatIndexBase(descr_A, CUSPARSE_INDEX_BASE_ZERO);
+
+	// create descriptor for matrix L
+	cusparseCreateMatDescr(&descr_L);
+
+	// initialize properties of matrix L
+	cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
+	cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT);
+	cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
+
+	// Create empty info objects for incomplete-cholesky factorization
+	cusparseCreateCsric02Info(&icinfo_A);
+	cusparseCreateCsrsv2Info(&info_L);
+	cusparseCreateCsrsv2Info(&info_LT);
+
+	// Compute buffer size in computing ic factorization
+	cusparseDcsric02_bufferSize(cusHandle, N, nz, descr_A, d_A, d_rowPtrA, 
+		d_colIdxA, icinfo_A, &bufferSize_A);
+	cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
+		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, &bufferSize_L);
+	cusparseDcsrsv2_bufferSize(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, 
+		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, &bufferSize_LT);
+	
+	// One buffer, sized for the largest request, serves every routine that takes d_buf
+	bufferSize = max(max(max(bufferSize, bufferSize_A), bufferSize_L), bufferSize_LT);
+	cudaMalloc(&d_buf, bufferSize);
+
+	// Perform incomplete-Cholesky factorization: analysis phase
+	cusparseDcsric02_analysis(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, 
+		d_colIdxA, icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
+		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_L, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+	cusparseDcsrsv2_analysis(cusHandle, CUSPARSE_OPERATION_TRANSPOSE, 
+		N, nz, descr_L, d_ic, d_rowPtrA, d_colIdxA, info_LT, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+
+	// Perform incomplete-Cholesky factorization: numeric phase, operating on d_ic
+	cusparseDcsric02(cusHandle, N, nz, descr_A, d_ic, d_rowPtrA, d_colIdxA, 
+		icinfo_A, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_buf);
+
+	// --- End of the preconditioning part ---
+
+	// Solver parameters
+    lcg_para self_para = lcg_default_parameters();
+	self_para.epsilon = 1e-6;
+	self_para.abs_diff = 0;
+
+	// Host solution vector, reset to zero before each solver run
+	int ret;
+	double *host_m = new double[N];
+
+	// Solve with CG
+	for (size_t i = 0; i < N; i++)
+	{
+		host_m[i] = 0.0;
+	}
+
+    ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CG);
+    lcg_error_str(ret);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Solve with CGS
+	for (size_t i = 0; i < N; i++)
+	{
+		host_m[i] = 0.0;
+	}
+
+	ret = lcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_CGS);
+    lcg_error_str(ret);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Solve with PCG
+	for (size_t i = 0; i < N; i++)
+	{
+		host_m[i] = 0.0;
+	}
+
+	ret = lcg_solver_preconditioned_cuda(cudaAx, cudaMx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, LCG_PCG);
+    lcg_error_str(ret);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Free Host memory
+	delete[] A;
+	delete[] rowIdxA;
+	delete[] colIdxA;
+	delete[] b;
+	delete[] ans_x;
+	delete[] host_m;
+
+	// Free Device memory
+	cudaFree(d_A);
+	cudaFree(d_rowIdxA);
+	cudaFree(d_rowPtrA);
+	cudaFree(d_colIdxA);
+	cudaFree(d_pd);
+	cudaFree(d_ic);
+
+	// Destroy the vector/matrix descriptors and release the shared work buffer
+	cusparseDestroyDnVec(dvec_tmp);
+	cusparseDestroySpMat(smat_A);
+	cudaFree(d_buf);
+
+	// Destroy the legacy descriptors and info objects of the preconditioner
+	cusparseDestroyMatDescr(descr_A);
+	cusparseDestroyMatDescr(descr_L);
+	cusparseDestroyCsric02Info(icinfo_A);
+	cusparseDestroyCsrsv2Info(info_L);
+	cusparseDestroyCsrsv2Info(info_LT);
+
+	// Free handles
+	cublasDestroy(cubHandle);
+	cusparseDestroy(cusHandle);
+
+	return 0;
+}
--- a/src/sample/sample9.cu
+++ b/src/sample/sample9.cu
@@ -0,0 +1,221 @@
+/******************************************************
+ * C++ Library of the Linear Conjugate Gradient Methods (LibLCG)
+ * 
+ * Copyright (C) 2022  Yi Zhang (yizhang-geo@zju.edu.cn)
+ * 
+ * LibLCG is distributed under a dual licensing scheme. You can
+ * redistribute it and/or modify it under the terms of the GNU Lesser
+ * General Public License (LGPL) as published by the Free Software Foundation,
+ * either version 2 of the License, or (at your option) any later version. 
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * 
+ * If the terms and conditions of the LGPL v.2. would prevent you from
+ * using the LibLCG, please consider the option to obtain a commercial
+ * license for a fee. These licenses are offered by the LibLCG developing 
+ * team. As a rule, licenses are provided "as-is", unlimited in time for 
+ * a one time fee. Please send corresponding requests to: yizhang-geo@zju.edu.cn. 
+ * Please do not forget to include some description of your company and the 
+ * realm of its activities. Also add information on how to contact you by 
+ * electronic and paper mail.
+ ******************************************************/
+
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "../lib/clcg_cuda.h"
+
+/**
+ * @brief Load a complex sparse linear system from a binary file.
+ *
+ * The file is consumed in this order: one int (N), one int (nz), then nz
+ * records of (int row, int col, cuDoubleComplex value), then N
+ * cuDoubleComplex entries of the right-hand side.
+ *
+ * All output arrays are allocated with new[] and must be released by the
+ * caller with delete[]. The outputs are only written once the whole file
+ * has been read successfully.
+ *
+ * @param filePath   Path of the binary input file.
+ * @param pN         Receives the first int of the file (length of b).
+ * @param pnz        Receives the second int of the file (number of records).
+ * @param cooVal     Receives the array of nz record values.
+ * @param cooRowIdx  Receives the array of nz row indices.
+ * @param cooColIdx  Receives the array of nz column indices.
+ * @param b          Receives the array of N right-hand-side entries.
+ *
+ * @throws std::runtime_error if the file cannot be opened, the header holds
+ *         negative sizes, or the file ends before all data are read.
+ */
+void read(std::string filePath, int *pN, int *pnz, cuDoubleComplex **cooVal,
+	int **cooRowIdx, int **cooColIdx, cuDoubleComplex **b)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in)
+	{
+		throw std::runtime_error("Fail to open file: " + filePath);
+	}
+
+	// Read the header into locals so garbage sizes never reach new[].
+	int n = 0, nnz = 0;
+	in.read((char*)&n, sizeof(int));
+	in.read((char*)&nnz, sizeof(int));
+	if (!in || n < 0 || nnz < 0)
+	{
+		throw std::runtime_error("Invalid header in file: " + filePath);
+	}
+
+	cuDoubleComplex *val = new cuDoubleComplex[nnz]{};
+	int *row = new int[nnz]{};
+	int *col = new int[nnz]{};
+	cuDoubleComplex *rhs = new cuDoubleComplex[n]{};
+
+	// Records are stored interleaved, so they are read one at a time.
+	for (int i = 0; i < nnz; ++i)
+	{
+		in.read((char*)&row[i], sizeof(int));
+		in.read((char*)&col[i], sizeof(int));
+		in.read((char*)&val[i], sizeof(cuDoubleComplex));
+	}
+
+	in.read((char*)rhs, static_cast<std::streamsize>(sizeof(cuDoubleComplex))*n);
+
+	// Any short read above leaves the stream in a failed state.
+	if (!in)
+	{
+		delete[] val;
+		delete[] row;
+		delete[] col;
+		delete[] rhs;
+		throw std::runtime_error("Unexpected end of file: " + filePath);
+	}
+
+	*pN = n;
+	*pnz = nnz;
+	*cooVal = val;
+	*cooRowIdx = row;
+	*cooColIdx = col;
+	*b = rhs;
+	return;
+}
+
+/**
+ * @brief Load a reference solution vector from a binary file.
+ *
+ * The file is consumed as one int (N) followed by N cuDoubleComplex entries.
+ * The output array is allocated with new[] and must be released by the
+ * caller with delete[]. The outputs are only written once the whole file
+ * has been read successfully.
+ *
+ * @param filePath  Path of the binary answer file.
+ * @param pN        Receives the first int of the file (length of x).
+ * @param x         Receives the array of N entries.
+ *
+ * @throws std::runtime_error if the file cannot be opened, the stored length
+ *         is negative, or the file ends before all data are read.
+ */
+void readAnswer(std::string filePath, int *pN, cuDoubleComplex **x)
+{
+	std::ifstream in(filePath, std::ios::binary);
+	if (!in)
+	{
+		throw std::runtime_error("Fail to open file: " + filePath);
+	}
+
+	// Read the length into a local so a garbage size never reaches new[].
+	int n = 0;
+	in.read((char*)&n, sizeof(int));
+	if (!in || n < 0)
+	{
+		throw std::runtime_error("Invalid header in file: " + filePath);
+	}
+
+	cuDoubleComplex *ans = new cuDoubleComplex[n]{};
+	in.read((char*)ans, static_cast<std::streamsize>(sizeof(cuDoubleComplex))*n);
+	if (!in)
+	{
+		delete[] ans;
+		throw std::runtime_error("Unexpected end of file: " + filePath);
+	}
+
+	*pN = n;
+	*x = ans;
+	return;
+}
+
+/**
+ * @brief Error measure between two complex vectors.
+ *
+ * Accumulates x*x + y*y of clcg_Zdiff(a[i], b[i]) (presumably a[i] - b[i];
+ * confirm against clcg_cuda.h) over all entries and returns the square
+ * root of that sum divided by n.
+ *
+ * @param a  First vector, at least n entries.
+ * @param b  Second vector, at least n entries.
+ * @param n  Number of entries to compare.
+ *
+ * @return sqrt(sum)/n, or 0 when n is not positive.
+ */
+lcg_float avg_error(cuDoubleComplex *a, cuDoubleComplex *b, int n)
+{
+	// Nothing to compare: avoid the division by zero (and, for negative n,
+	// the out-of-range loop a signed/unsigned comparison would cause).
+	if (n <= 0) return 0.0;
+
+	lcg_float avg = 0.0;
+	cuDoubleComplex tmp;
+	for (int i = 0; i < n; i++)
+	{
+		tmp = clcg_Zdiff(a[i], b[i]);
+		avg += (tmp.x*tmp.x + tmp.y*tmp.y);
+	}
+	return sqrt(avg)/n;
+}
+
+// Declare as global variables
+cuDoubleComplex one, zero;
+
+void *d_buf;
+cusparseSpMatDescr_t smat_A;
+
+int *d_rowIdxA; // COO
+int *d_rowPtrA; // CSR
+int *d_colIdxA;
+cuDoubleComplex *d_A;
+cuDoubleComplex *d_B;
+
+/**
+ * @brief Matrix-vector product callback handed to the solver.
+ *
+ * Computes prod_Ax = op(A) * x with cusparseSpMV, using the global matrix
+ * descriptor smat_A and the global work buffer d_buf. The callback returns
+ * void, so a cuSPARSE failure cannot be propagated; it is reported on
+ * std::cerr instead of being dropped silently.
+ *
+ * @param instance    Unused here.
+ * @param cub_handle  Unused here.
+ * @param cus_handle  cuSPARSE handle used for the product.
+ * @param x           Input dense vector descriptor.
+ * @param prod_Ax     Output dense vector descriptor.
+ * @param n_size      Unused here.
+ * @param nz_size     Unused here.
+ * @param oper_t      Operation applied to A, forwarded to cusparseSpMV.
+ */
+void cudaAx(void* instance, cublasHandle_t cub_handle, cusparseHandle_t cus_handle, 
+    cusparseDnVecDescr_t x, cusparseDnVecDescr_t prod_Ax, const int n_size, const int nz_size, 
+	cusparseOperation_t oper_t)
+{
+	// (Re)set the scalars on every call so the callback does not depend on
+	// them having been given values elsewhere.
+	one.x = 1.0; one.y = 0.0;
+	zero.x = 0.0; zero.y = 0.0;
+
+	// Calculate the product of A*x
+	cusparseStatus_t status = cusparseSpMV(cus_handle, oper_t, &one, smat_A, x, &zero, prod_Ax,
+		CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, d_buf);
+	if (status != CUSPARSE_STATUS_SUCCESS)
+	{
+		std::cerr << "cusparseSpMV failed with cuSPARSE status " << static_cast<int>(status) << std::endl;
+	}
+    return;
+}
+
+/**
+ * @brief Progress callback handed to the solver.
+ *
+ * Writes one line to std::clog once the reported convergence value is not
+ * larger than param->epsilon; otherwise it stays silent. Always returns 0.
+ *
+ * @param instance  Unused here.
+ * @param m         Unused here.
+ * @param converge  Convergence value reported by the solver.
+ * @param param     Solver parameters; only epsilon is read.
+ * @param n_size    Unused here.
+ * @param nz_size   Unused here.
+ * @param k         Iteration count, printed in the message.
+ *
+ * @return Always 0.
+ */
+int cudaProgress(void* instance, const cuDoubleComplex* m, const lcg_float converge, 
+	const clcg_para* param, const int n_size, const int nz_size, const int k)
+{
+	const bool reached = (converge <= param->epsilon);
+	if (!reached)
+	{
+		return 0;
+	}
+
+	std::clog << "Iteration-times: " << k << "\tconvergence: " << converge << std::endl;
+	return 0;
+}
+
+// Abort the sample with a message when a CUDA runtime call fails.
+static void sample9_check_cuda(cudaError_t err, const char *what)
+{
+	if (err != cudaSuccess)
+	{
+		std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
+		std::exit(EXIT_FAILURE);
+	}
+}
+
+// Abort the sample with a message when a cuSPARSE call fails.
+static void sample9_check_cusparse(cusparseStatus_t status, const char *what)
+{
+	if (status != CUSPARSE_STATUS_SUCCESS)
+	{
+		std::cerr << what << " failed with cuSPARSE status " << static_cast<int>(status) << std::endl;
+		std::exit(EXIT_FAILURE);
+	}
+}
+
+// Set every entry of a host complex vector to zero.
+static void sample9_zero_fill(cuDoubleComplex *m, int n)
+{
+	for (int i = 0; i < n; i++)
+	{
+		m[i].x = 0.0; m[i].y = 0.0;
+	}
+}
+
+/**
+ * @brief Sample driver: solve a complex sparse system on the GPU.
+ *
+ * Loads the system and its reference answer from the data/ folder, uploads
+ * the matrix, converts it from COO to CSR, then runs clcg_solver_cuda with
+ * CLCG_BICG and CLCG_BICG_SYM from a zero initial guess, printing the error
+ * against the reference answer after each run.
+ *
+ * @return 0 on completion, EXIT_FAILURE if the input data cannot be used or
+ *         a CUDA/cuBLAS/cuSPARSE set-up call fails.
+ */
+int main(int argc, char **argv)
+{
+	std::string inputPath = "data/case_1K_cA";
+	std::string answerPath = "data/case_1K_cB";
+
+	int N = 0, nz = 0, N_ans = 0;
+	int *rowIdxA = nullptr, *colIdxA = nullptr;
+	cuDoubleComplex *A = nullptr, *b = nullptr, *ans_x = nullptr;
+	cuDoubleComplex *host_m = nullptr;
+
+	// delete[] on a null pointer is a no-op, so this is safe at any stage.
+	auto free_host = [&]()
+	{
+		delete[] A;
+		delete[] rowIdxA;
+		delete[] colIdxA;
+		delete[] b;
+		delete[] ans_x;
+		delete[] host_m;
+	};
+
+	try
+	{
+		read(inputPath, &N, &nz, &A, &rowIdxA, &colIdxA, &b);
+		// Keep the answer length separate so it cannot overwrite the system size.
+		readAnswer(answerPath, &N_ans, &ans_x);
+	}
+	catch (const std::exception &e)
+	{
+		std::cerr << "Fail to load the input data: " << e.what() << std::endl;
+		free_host();
+		return EXIT_FAILURE;
+	}
+
+	std::clog << "N = " << N << std::endl;
+	std::clog << "nz = " << nz << std::endl;
+
+	if (N <= 0 || nz <= 0 || N_ans != N)
+	{
+		std::cerr << "Invalid input data: N = " << N << ", nz = " << nz
+			<< ", answer length = " << N_ans << std::endl;
+		free_host();
+		return EXIT_FAILURE;
+	}
+
+	// Create handles
+	cublasHandle_t cubHandle;
+	cusparseHandle_t cusHandle;
+
+	if (cublasCreate(&cubHandle) != CUBLAS_STATUS_SUCCESS)
+	{
+		std::cerr << "cublasCreate failed" << std::endl;
+		free_host();
+		return EXIT_FAILURE;
+	}
+	sample9_check_cusparse(cusparseCreate(&cusHandle), "cusparseCreate");
+
+	// Allocate GPU memory & copy matrix/vector to device
+	sample9_check_cuda(cudaMalloc(&d_A, nz * sizeof(cuDoubleComplex)), "cudaMalloc(d_A)");
+	sample9_check_cuda(cudaMalloc(&d_rowIdxA, nz * sizeof(int)), "cudaMalloc(d_rowIdxA)");
+	sample9_check_cuda(cudaMalloc(&d_rowPtrA, (N + 1) * sizeof(int)), "cudaMalloc(d_rowPtrA)");
+	sample9_check_cuda(cudaMalloc(&d_colIdxA, nz * sizeof(int)), "cudaMalloc(d_colIdxA)");
+	sample9_check_cuda(cudaMalloc(&d_B, N * sizeof(cuDoubleComplex)), "cudaMalloc(d_B)");
+
+	sample9_check_cuda(cudaMemcpy(d_A, A, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
+	sample9_check_cuda(cudaMemcpy(d_rowIdxA, rowIdxA, nz * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(d_rowIdxA)");
+	sample9_check_cuda(cudaMemcpy(d_colIdxA, colIdxA, nz * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(d_colIdxA)");
+
+	// Convert matrix A from COO format to CSR format
+	// NOTE(review): presumably the input records are sorted by row, as the
+	// conversion expects -- confirm against the data files.
+	sample9_check_cusparse(cusparseXcoo2csr(cusHandle, d_rowIdxA, nz, N, d_rowPtrA, CUSPARSE_INDEX_BASE_ZERO),
+		"cusparseXcoo2csr");
+
+	// Create sparse matrix
+	sample9_check_cusparse(cusparseCreateCsr(&smat_A, N, N, nz, d_rowPtrA, d_colIdxA, d_A, CUSPARSE_INDEX_32I,
+		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F), "cusparseCreateCsr");
+
+	// This is just used to get bufferSize; the content of d_B is never read here.
+	cusparseDnVecDescr_t dvec_tmp;
+	sample9_check_cusparse(cusparseCreateDnVec(&dvec_tmp, N, d_B, CUDA_C_64F), "cusparseCreateDnVec");
+
+	// Give the scalars their values before their addresses are handed to cuSPARSE.
+	one.x = 1.0; one.y = 0.0;
+	zero.x = 0.0; zero.y = 0.0;
+
+	size_t bufferSize_B = 0, bufferSize_B2 = 0;
+
+	// Query with the same algorithm that cudaAx() passes to cusparseSpMV.
+	sample9_check_cusparse(cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, smat_A,
+		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B),
+		"cusparseSpMV_bufferSize (non-transpose)");
+
+	sample9_check_cusparse(cusparseSpMV_bufferSize(cusHandle, CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE, &one, smat_A,
+		dvec_tmp, &zero, dvec_tmp, CUDA_C_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize_B2),
+		"cusparseSpMV_bufferSize (conjugate transpose)");
+
+	// One buffer serves both operations, so it must fit the larger request.
+	if (bufferSize_B2 > bufferSize_B) bufferSize_B = bufferSize_B2;
+	sample9_check_cuda(cudaMalloc(&d_buf, bufferSize_B), "cudaMalloc(d_buf)");
+
+	// Set up the solver parameters
+	clcg_para self_para = clcg_default_parameters();
+	self_para.epsilon = 1e-6;
+	self_para.abs_diff = 0;
+
+	int ret;
+	host_m = new cuDoubleComplex[N];
+
+	// Solve with BICG, starting from a zero initial solution
+	sample9_zero_fill(host_m, N);
+
+	ret = clcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, CLCG_BICG);
+	lcg_error_str(ret);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Solve with BICG_SYM, starting from a zero initial solution
+	sample9_zero_fill(host_m, N);
+
+	ret = clcg_solver_cuda(cudaAx, cudaProgress, host_m, b, N, nz, &self_para, nullptr, cubHandle, cusHandle, CLCG_BICG_SYM);
+	lcg_error_str(ret);
+
+	std::clog << "Averaged error (compared with ans_x): " << avg_error(host_m, ans_x, N) << std::endl;
+
+	// Free Host memory
+	free_host();
+
+	// Destroy the descriptors before releasing the memory they refer to
+	cusparseDestroyDnVec(dvec_tmp);
+	cusparseDestroySpMat(smat_A);
+
+	// Free Device memory
+	cudaFree(d_A);
+	cudaFree(d_rowIdxA);
+	cudaFree(d_rowPtrA);
+	cudaFree(d_colIdxA);
+	cudaFree(d_B);
+	cudaFree(d_buf);
+
+	// Free handles
+	cublasDestroy(cubHandle);
+	cusparseDestroy(cusHandle);
+
+	return 0;
+}