initial upload

2025-12-17 10:53:43 +08:00
commit f3f1778f77
308 changed files with 129940 additions and 0 deletions
--- a/include/1d_models.h
+++ b/include/1d_models.h
@@ -0,0 +1,397 @@
+#ifndef ONE_D_MODELS_H
+#define ONE_D_MODELS_H
+
+#include <vector>
+#include "config.h"
+
+
+// ak135  http://rses.anu.edu.au/seismology/ak135/ak135t.html
+// depth(km), P velocity(km/s)
+inline const std::vector<std::vector<CUSTOMREAL>> model_1d_ak135 \
+    {{     0.000,      5.8000},
+     {    20.000,      5.8000},
+     {    20.000,      6.5000},
+     {    35.000,      6.5000},
+     {    35.000,      8.0400},
+     {    77.500,      8.0450},
+     {   120.000,      8.0500},
+     {   165.000,      8.1750},
+     {   210.000,      8.3000},
+     {   210.000,      8.3000},
+     {   260.000,      8.4825},
+     {   310.000,      8.6650},
+     {   360.000,      8.8475},
+     {   410.000,      9.0300},
+     {   410.000,      9.3600},
+     {   460.000,      9.5280},
+     {   510.000,      9.6960},
+     {   560.000,      9.8640},
+     {   610.000,     10.0320},
+     {   660.000,     10.2000},
+     {   660.000,     10.7900},
+     {   710.000,     10.9229},
+     {   760.000,     11.0558},
+     {   809.500,     11.1353},
+     {   859.000,     11.2221},
+     {   908.500,     11.3068},
+     {   958.000,     11.3896},
+     {  1007.500,     11.4705},
+     {  1057.000,     11.5495},
+     {  1106.500,     11.6269},
+     {  1156.000,     11.7026},
+     {  1205.500,     11.7766},
+     {  1255.000,     11.8491},
+     {  1304.500,     11.9200},
+     {  1354.000,     11.9895},
+     {  1403.500,     12.0577},
+     {  1453.000,     12.1245},
+     {  1502.500,     12.1912},
+     {  1552.000,     12.2550},
+     {  1601.500,     12.3185},
+     {  1651.000,     12.3819},
+     {  1700.500,     12.4426},
+     {  1750.000,     12.5031},
+     {  1799.500,     12.5631},
+     {  1849.000,     12.6221},
+     {  1898.500,     12.6804},
+     {  1948.000,     12.7382},
+     {  1997.500,     12.7956},
+     {  2047.000,     12.8526},
+     {  2096.500,     12.9096},
+     {  2146.000,     12.9668},
+     {  2195.500,     13.0222},
+     {  2245.000,     13.0783},
+     {  2294.500,     13.1336},
+     {  2344.000,     13.1894},
+     {  2393.500,     13.2465},
+     {  2443.000,     13.3018},
+     {  2492.500,     13.3585},
+     {  2542.000,     13.4156},
+     {  2591.500,     13.4741},
+     {  2640.000,     13.5312},
+     {  2690.000,     13.5900},
+     {  2740.000,     13.6494},
+     {  2740.000,     13.6494},
+     {  2789.670,     13.6530},
+     {  2839.330,     13.6566},
+     {  2891.500,     13.6602},
+     {  2891.500,      8.0000},
+     {  2939.330,      8.0382},
+     {  2989.660,      8.1283},
+     {  3039.990,      8.2213},
+     {  3090.320,      8.3122},
+     {  3140.660,      8.4001},
+     {  3190.990,      8.4861},
+     {  3241.320,      8.5692},
+     {  3291.650,      8.6496},
+     {  3341.980,      8.7283},
+     {  3392.310,      8.8036},
+     {  3442.640,      8.8761},
+     {  3492.970,      8.9461},
+     {  3543.300,      9.0138},
+     {  3593.640,      9.0792},
+     {  3643.970,      9.1426},
+     {  3694.300,      9.2042},
+     {  3744.630,      9.2634},
+     {  3794.960,      9.3205},
+     {  3845.290,      9.3760},
+     {  3895.620,      9.4297},
+     {  3945.950,      9.4814},
+     {  3996.280,      9.5306},
+     {  4046.620,      9.5777},
+     {  4096.950,      9.6232},
+     {  4147.280,      9.6673},
+     {  4197.610,      9.7100},
+     {  4247.940,      9.7513},
+     {  4298.270,      9.7914},
+     {  4348.600,      9.8304},
+     {  4398.930,      9.8682},
+     {  4449.260,      9.9051},
+     {  4499.600,      9.9410},
+     {  4549.930,      9.9761},
+     {  4600.260,     10.0103},
+     {  4650.590,     10.0439},
+     {  4700.920,     10.0768},
+     {  4801.580,     10.1415},
+     {  4851.910,     10.1739},
+     {  4902.240,     10.2049},
+     {  4952.580,     10.2329},
+     {  5002.910,     10.2565},
+     {  5053.240,     10.2745},
+     {  5103.570,     10.2854},
+     {  5153.500,     10.2890},
+     {  5153.500,     11.0427},
+     {  5204.610,     11.0585},
+     {  5255.320,     11.0718},
+     {  5306.040,     11.0850},
+     {  5356.750,     11.0983},
+     {  5407.460,     11.1166},
+     {  5458.170,     11.1316},
+     {  5508.890,     11.1457},
+     {  5559.600,     11.1590},
+     {  5610.310,     11.1715},
+     {  5661.020,     11.1832},
+     {  5711.740,     11.1941},
+     {  5813.160,     11.2134},
+     {  5863.870,     11.2219},
+     {  5914.590,     11.2295},
+     {  5965.300,     11.2364},
+     {  6016.010,     11.2424},
+     {  6066.720,     11.2477},
+     {  6117.440,     11.2521},
+     {  6168.150,     11.2557},
+     {  6218.860,     11.2586},
+     {  6269.570,     11.2606},
+     {  6320.290,     11.2618},
+     {  6371.000,     11.2622}};
+
+
+// iasp91 model from http://ds.iris.edu/spud/earthmodel/9991809
+inline const std::vector<std::vector<CUSTOMREAL>> model_1d_iasp91 \
+    {{0.00,     5.8000},
+     {1.00,     5.8000},
+     {2.00,     5.8000},
+     {3.00,     5.8000},
+     {4.00,     5.8000},
+     {5.00,     5.8000},
+     {6.00,     5.8000},
+     {7.00,     5.8000},
+     {8.00,     5.8000},
+     {9.00,     5.8000},
+     {10.00,    5.8000},
+     {11.00,    5.8000},
+     {12.00,    5.8000},
+     {13.00,    5.8000},
+     {14.00,    5.8000},
+     {15.00,    5.8000},
+     {16.00,    5.8000},
+     {17.00,    5.8000},
+     {18.00,    5.8000},
+     {19.00,    5.8000},
+     {20.00,    5.8000},
+     {20.00,    6.5000},
+     {21.00,    6.5000},
+     {22.00,    6.5000},
+     {23.00,    6.5000},
+     {24.00,    6.5000},
+     {25.00,    6.5000},
+     {26.00,    6.5000},
+     {27.00,    6.5000},
+     {28.00,    6.5000},
+     {29.00,    6.5000},
+     {30.00,    6.5000},
+     {31.00,    6.5000},
+     {32.00,    6.5000},
+     {33.00,    6.5000},
+     {34.00,    6.5000},
+     {35.00,    6.5000},
+     {35.00,    8.0400},
+     {40.00,    8.0406},
+     {45.00,    8.0412},
+     {50.00,    8.0418},
+     {60.00,    8.0429},
+     {70.00,    8.0441},
+     {80.00,    8.0453},
+     {90.00,    8.0465},
+     {100.00,   8.0476},
+     {110.00,   8.0488},
+     {120.00,   8.0500},
+     {120.00,   8.0500},
+     {130.00,   8.0778},
+     {140.00,   8.1056},
+     {150.00,   8.1333},
+     {160.00,   8.1611},
+     {170.00,   8.1889},
+     {180.00,   8.2167},
+     {190.00,   8.2444},
+     {200.00,   8.2722},
+     {210.00,   8.3000},
+     {210.00,   8.3000},
+     {220.00,   8.3365},
+     {230.00,   8.3730},
+     {240.00,   8.4095},
+     {250.00,   8.4460},
+     {260.00,   8.4825},
+     {270.00,   8.5190},
+     {280.00,   8.5555},
+     {290.00,   8.5920},
+     {300.00,   8.6285},
+     {310.00,   8.6650},
+     {320.00,   8.7015},
+     {330.00,   8.7380},
+     {340.00,   8.7745},
+     {350.00,   8.8110},
+     {360.00,   8.8475},
+     {370.00,   8.8840},
+     {380.00,   8.9205},
+     {390.00,   8.9570},
+     {400.00,   8.9935},
+     {410.00,   9.0300},
+     {410.00,   9.3600},
+     {420.00,   9.3936},
+     {430.00,   9.4272},
+     {440.00,   9.4608},
+     {450.00,   9.4944},
+     {460.00,   9.5280},
+     {470.00,   9.5616},
+     {480.00,   9.5952},
+     {490.00,   9.6288},
+     {500.00,   9.6624},
+     {510.00,   9.6960},
+     {520.00,   9.7296},
+     {530.00,   9.7632},
+     {540.00,   9.7968},
+     {550.00,   9.8304},
+     {560.00,   9.8640},
+     {570.00,   9.8976},
+     {580.00,   9.9312},
+     {590.00,   9.9648},
+     {600.00,   9.9984},
+     {610.00,   10.0320},
+     {620.00,   10.0656},
+     {630.00,   10.0992},
+     {640.00,   10.1328},
+     {650.00,   10.1664},
+     {660.00,   10.2000},
+     {660.00,   10.7900},
+     {670.00,   10.8166},
+     {680.00,   10.8432},
+     {690.00,   10.8697},
+     {700.00,   10.8963},
+     {710.00,   10.9229},
+     {720.00,   10.9495},
+     {730.00,   10.9761},
+     {740.00,   11.0026},
+     {750.00,   11.0292},
+     {760.00,   11.0558},
+     {760.00,   11.0558},
+     {770.00,   11.0738},
+     {780.00,   11.0917},
+     {790.00,   11.1095},
+     {800.00,   11.1272},
+     {900.00,   11.2997},
+     {1000.00,  11.4640},
+     {1100.00,  11.6208},
+     {1200.00,  11.7707},
+     {1300.00,  11.9142},
+     {1400.00,  12.0521},
+     {1500.00,  12.1849},
+     {2000.00,  12.7944},
+     {2500.00,  13.3697},
+     {2700.00,  13.6076},
+     {2740.00,  13.6564},
+     {2740.00,  13.6564},
+     {2750.00,  13.6587},
+     {2800.00,  13.6703},
+     {2850.00,  13.6818},
+     {2889.00,  13.6908},
+     {2889.00,  8.0088},
+     {2900.00,  8.0280},
+     {3000.00,  8.1995},
+     {3100.00,  8.3642},
+     {3200.00,  8.5222},
+     {3300.00,  8.6735},
+     {3400.00,  8.8180},
+     {3500.00,  8.9558},
+     {4000.00,  9.5437},
+     {4500.00,  9.9633},
+     {5153.90,  10.2578},
+     {5153.90,  11.0914},
+     {5500.00,  11.1644},
+     {6000.00,  11.2270},
+     {6371.00,  11.2409}};
+
+
+// fortran test model
+inline const std::vector<std::vector<CUSTOMREAL>> model_1d_prem \
+    = {{    0.00,     5.80000},
+       {   15.00,     5.80000},
+       {   15.00,     6.80000},
+       {   24.40,     6.80000},
+       {   24.40,     8.11061},
+       {   40.00,     8.10119},
+       {   60.00,     8.08907},
+       {   80.00,     8.07688},
+       {  115.00,     8.05540},
+       {  150.00,     8.03370},
+       {  185.00,     8.01180},
+       {  220.00,     7.98970},
+       {  220.00,     8.55896},
+       {  265.00,     8.64552},
+       {  310.00,     8.73209},
+       {  355.00,     8.81867},
+       {  400.00,     8.90522},
+       {  400.00,     9.13397},
+       {  450.00,     9.38990},
+       {  500.00,     9.64588},
+       {  550.00,     9.90185},
+       {  600.00,    10.15782},
+       {  635.00,    10.21203},
+       {  670.00,    10.26622},
+       {  670.00,    10.75131},
+       {  721.00,    10.91005},
+       {  771.00,    11.06557},
+       {  871.00,    11.24490},
+       {  971.00,    11.41560},
+       { 1071.00,    11.57828},
+       { 1171.00,    11.73357},
+       { 1271.00,    11.88209},
+       { 1371.00,    12.02445},
+       { 1471.00,    12.16126},
+       { 1571.00,    12.29316},
+       { 1671.00,    12.42075},
+       { 1771.00,    12.54466},
+       { 1871.00,    12.66550},
+       { 1971.00,    12.78389},
+       { 2071.00,    12.90045},
+       { 2171.00,    13.01579},
+       { 2271.00,    13.13055},
+       { 2371.00,    13.24532},
+       { 2471.00,    13.36074},
+       { 2571.00,    13.47742},
+       { 2671.00,    13.59597},
+       { 2741.00,    13.68041},
+       { 2771.00,    13.68753},
+       { 2871.00,    13.71168},
+       { 2891.00,    13.71660},
+       { 2891.00,     8.06482},
+       { 2971.00,     8.19939},
+       { 3071.00,     8.36019},
+       { 3171.00,     8.51298},
+       { 3271.00,     8.65805},
+       { 3371.00,     8.79573},
+       { 3471.00,     8.92632},
+       { 3571.00,     9.05015},
+       { 3671.00,     9.16752},
+       { 3771.00,     9.27867},
+       { 3871.00,     9.38418},
+       { 3971.00,     9.48409},
+       { 4071.00,     9.57881},
+       { 4171.00,     9.66865},
+       { 4271.00,     9.75393},
+       { 4371.00,     9.83496},
+       { 4471.00,     9.91206},
+       { 4571.00,     9.98554},
+       { 4671.00,    10.05572},
+       { 4771.00,    10.12291},
+       { 4871.00,    10.18743},
+       { 4971.00,    10.24959},
+       { 5071.00,    10.30971},
+       { 5149.50,    10.35568},
+       { 5149.50,    11.02827},
+       { 5171.00,    11.03643},
+       { 5271.00,    11.07249},
+       { 5371.00,    11.10542},
+       { 5471.00,    11.13521},
+       { 5571.00,    11.16186},
+       { 5671.00,    11.18538},
+       { 5771.00,    11.20576},
+       { 5871.00,    11.22301},
+       { 5971.00,    11.23712},
+       { 6071.00,    11.24809},
+       { 6171.00,    11.25593},
+       { 6271.00,    11.26064},
+       { 6371.00,    11.26220}};
+
+
+#endif
--- a/include/config.h
+++ b/include/config.h
@@ -0,0 +1,332 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+
+#include <math.h>
+#include <mpi.h>
+#include <string>
+#include <vector>
+#include <limits>
+#include <iostream>
+#include "version.h"
+
+// custom floating point accuracy
+#ifdef SINGLE_PRECISION
+#define CUSTOMREAL float
+#define MPI_CR MPI_FLOAT
+#else
+#define CUSTOMREAL double
+#define MPI_CR MPI_DOUBLE
+#endif
+
+
+#define MPI_DUMMY_TAG 1000
+
+// version
+inline const std::string TOMOATT_VERSION = std::to_string(PROJECT_VERSION_MAJOR) +
+                                   "." + std::to_string(PROJECT_VERSION_MINOR) +
+                                   "." + std::to_string(PROJECT_VERSION_PATCH);
+
+inline int loc_I, loc_J, loc_K;
+inline int loc_I_vis, loc_J_vis, loc_K_vis;
+inline int loc_I_excl_ghost, loc_J_excl_ghost, loc_K_excl_ghost;
+inline int n_inv_grids;
+inline int n_inv_I_loc, n_inv_J_loc, n_inv_K_loc;
+inline int n_inv_I_loc_ani, n_inv_J_loc_ani, n_inv_K_loc_ani;
+
+// 3d indices to 1d index
+#define I2V(A,B,C) ((C)*loc_I*loc_J + (B)*loc_I + A)
+inline void V2I(const int& ijk, int& i, int& j, int& k) {
+    k = ijk / (loc_I * loc_J);
+    j = (ijk - k * loc_I * loc_J) / loc_I;
+    i = ijk - k * loc_I * loc_J - j * loc_I;
+}
+
+#define I2V_INV_GRIDS(A,B,C,D) ((D)*n_inv_I_loc*n_inv_J_loc*n_inv_K_loc + (C)*n_inv_I_loc*n_inv_J_loc + (B)*n_inv_I_loc + A)
+#define I2V_INV_KNL(A,B,C)     ((C)*n_inv_I_loc*n_inv_J_loc + (B)*n_inv_I_loc + A)
+#define I2V_INV_GRIDS_1DK(A,B)    ((B)*n_inv_K_loc + (A))
+#define I2V_INV_GRIDS_2DJ(A,B,C)  ((C)*n_inv_J_loc*n_inv_K_loc + (B)*n_inv_J_loc + (A))
+#define I2V_INV_GRIDS_2DI(A,B,C)  ((C)*n_inv_I_loc*n_inv_K_loc + (B)*n_inv_I_loc + (A))
+
+#define I2V_INV_ANI_GRIDS(A,B,C,D) ((D)*n_inv_I_loc_ani*n_inv_J_loc_ani*n_inv_K_loc_ani + (C)*n_inv_I_loc_ani*n_inv_J_loc_ani + (B)*n_inv_I_loc_ani + A)
+#define I2V_INV_ANI_KNL(A,B,C)     ((C)*n_inv_I_loc_ani*n_inv_J_loc_ani + (B)*n_inv_I_loc_ani + A)
+#define I2V_INV_ANI_GRIDS_1DK(A,B)    ((B)*n_inv_K_loc_ani + (A))
+#define I2V_INV_ANI_GRIDS_1DJ(A,B,C)  ((C)*n_inv_J_loc_ani*n_inv_K_loc_ani + (B)*n_inv_J_loc_ani + (A))
+#define I2V_INV_ANI_GRIDS_1DI(A,B,C)  ((C)*n_inv_I_loc_ani*n_inv_K_loc_ani + (B)*n_inv_I_loc_ani + (A))
+
+#define I2V_INV_GRIDS_1D_GENERIC(A,B,C_ninv) ((B)*C_ninv + (A))
+#define I2V_INV_GRIDS_2D_GENERIC(A,B,C,D_ninv,E_inv)  ((C)*D_ninv*E_inv + (B)*D_ninv + (A))
+
+#define I2V_EXCL_GHOST(A,B,C)  ((C)* loc_I_excl_ghost   * loc_J_excl_ghost +    (B)* loc_I_excl_ghost    + A)
+#define I2V_VIS(A,B,C)         ((C)* loc_I_vis   * loc_J_vis +    (B)* loc_I_vis    + A)
+#define I2V_3D(A,B,C)          ((C)* loc_I_excl_ghost*loc_J_excl_ghost + (B)* loc_I_excl_ghost + A)
+#define I2V_ELM_CONN(A,B,C)    ((C)*(loc_I_vis-1)*(loc_J_vis-1) + (B)*(loc_I_vis-1) + A)
+#define I2VLBFGS(A,B,C,D)      ((A)*loc_I*loc_J*loc_K + (D)*loc_I*loc_J + (C)*loc_I + B)
+
+// 2d indices to 1d index
+#define IJ2V(A,B,C) (C*loc_I*loc_J + (B)*loc_I + A)
+#define JK2V(A,B,C) (C*loc_J*loc_K + (B)*loc_J + A)
+#define IK2V(A,B,C) (C*loc_I*loc_K + (B)*loc_I + A)
+
+inline const CUSTOMREAL eps    = 1e-12;
+inline const CUSTOMREAL epsAdj = 1e-6;
+
+inline bool isZero(CUSTOMREAL x) {
+    return fabs(x) < eps;
+}
+
+inline bool isNegative(CUSTOMREAL x) {
+    return (x) < -eps;
+}
+
+inline bool isPositive(CUSTOMREAL x) {
+    return (x) > eps;
+}
+
+inline bool isZeroAdj(CUSTOMREAL x) {
+    return fabs(x) < epsAdj;
+}
+
+// constants
+inline const CUSTOMREAL PI        = 3.14159265358979323846264338327950288;
+inline const CUSTOMREAL DEG2RAD   = PI/180.0;
+inline const CUSTOMREAL RAD2DEG   = 180.0/PI;
+inline const CUSTOMREAL M2KM      = 0.001;
+inline const CUSTOMREAL _0_CR     = 0.0;
+inline const CUSTOMREAL _0_5_CR   = 0.5;
+inline const CUSTOMREAL _1_CR     = 1.0;
+inline const CUSTOMREAL _1_5_CR   = 1.5;
+inline const CUSTOMREAL _2_CR     = 2.0;
+inline const CUSTOMREAL _3_CR     = 3.0;
+inline const CUSTOMREAL _4_CR     = 4.0;
+inline const CUSTOMREAL _8_CR     = 8.0;
+inline const CUSTOMREAL TAU_INITIAL_VAL = 1.0;
+inline const CUSTOMREAL TAU_INF_VAL = 20.0;
+inline const int        SWEEPING_COEFF  = 1.0; // coefficient for calculationg sigr/sigt/sigp for cpu
+
+// ASCII output precision
+typedef std::numeric_limits< CUSTOMREAL > dbl;
+inline const int        ASCII_OUTPUT_PRECISION = dbl::max_digits10;
+
+// radious of the earth in km
+//inline CUSTOMREAL R_earth = 6378.1370;
+inline CUSTOMREAL       R_earth        = 6371.0; // for compatibility with fortran code
+inline const CUSTOMREAL GAMMA          = 0.0;
+inline const CUSTOMREAL r_kermel_mask  = 40.0;
+inline CUSTOMREAL       step_length_init = 0.01; // update step size limit
+inline CUSTOMREAL       step_length_init_sc = 0.001; // update step size limit (for station correction)
+inline CUSTOMREAL       step_length_decay = 0.9;
+inline CUSTOMREAL       step_length_down = 0.5;
+inline CUSTOMREAL       step_length_up = 1.2;
+inline CUSTOMREAL       step_length_lbfgs;
+inline int              step_method         = 1; // 0：modulate according to obj, 1: modulate according to gradient direction.
+inline CUSTOMREAL       step_length_gradient_angle = 120.0;
+inline const int        OBJ_DEFINED         = 0;
+inline const int        GRADIENT_DEFINED    = 1;
+
+// halve steping params
+inline const CUSTOMREAL HALVE_STEP_RATIO = 0.7;
+inline const CUSTOMREAL MAX_DIFF_RATIO_VOBJ = 0.8; // maximum difference ratio between vobj_t+1 and vobj_t
+inline const CUSTOMREAL HALVE_STEP_RESTORAION_RATIO = 0.7; // no restoration if == HALVE_STEP_RATIO
+
+// RUN MODE TYPE FLAG
+inline const int ONLY_FORWARD        = 0;
+inline const int DO_INVERSION        = 1;
+inline const int SRC_RELOCATION      = 2;
+inline const int INV_RELOC           = 3;
+inline const int ONED_INVERSION      = 4;  
+inline const int TELESEIS_PREPROCESS = -1; // hiden function
+
+// SWEEPING TYPE FLAG
+inline const int SWEEP_TYPE_LEGACY = 0;
+inline const int SWEEP_TYPE_LEVEL  = 1;
+inline      bool hybrid_stencil_order = false; // if true, code at first run 1st order, then change to 3rd order (Dong Cui 2020)
+
+// STENCIL TYPE
+inline const int NON_UPWIND = 0;
+inline const int UPWIND     = 1;
+
+// convert depth <-> radius
+inline CUSTOMREAL depth2radius(CUSTOMREAL depth) {
+    return R_earth - depth;
+}
+inline CUSTOMREAL radius2depth(CUSTOMREAL radius) {
+    return R_earth - radius;
+}
+
+// parallel calculation parameters
+// here we use so called "inline variables" for defining global variables
+inline int ngrid_i     = 0; // number of grid points in i direction
+inline int ngrid_j     = 0; // number of grid points in j direction
+inline int ngrid_k     = 0; // number of grid points in k direction
+//inline int ngrid_i_inv = 0; // number of inversion grid points in i direction # DUPLICATED with n_inv_I_loc as all the subdomains have the same number of inversion grid points
+//inline int ngrid_j_inv = 0; // number of inversion grid points in j direction # DUPLICATED with n_inv_J_loc
+//inline int ngrid_k_inv = 0; // number of inversion grid points in k direction # DUPLICATED with n_inv_K_loc
+//inline int ngrid_i_inv_ani = 0; // number of inversion grid points in i direction for anisotropy # DUPLICATED with n_inv_I_loc_ani
+//inline int ngrid_j_inv_ani = 0; // number of inversion grid points in j direction for anisotropy # DUPLICATED with n_inv_J_loc_ani
+//inline int ngrid_k_inv_ani = 0; // number of inversion grid points in k direction for anisotropy # DUPLICATED with n_inv_K_loc_ani
+
+// inversion grid type flags
+#define INV_GRID_REGULAR 0
+#define INV_GRID_FLEX    1
+#define INV_GRID_TRAPE   2
+
+// mpi parameters
+inline int      world_nprocs;     // total number of processes (all groups)
+inline int      world_rank;       // mpi rank of this process  (all groups)
+inline int      sim_nprocs;       // number of processes in this simulation group
+inline int      sim_rank;         // mpi rank of this process in this simulation group
+inline int      inter_sim_rank;
+inline int      sub_nprocs;       // total number of processes
+inline int      sub_rank;         // mpi rank of this process
+inline int      inter_sub_rank;   // mpi rank of this process in the inter-subdomain communicator
+inline int      inter_sub_nprocs; // number of processes in the inter-subdomain communicator
+inline int      nprocs;           // = n subdomains
+inline int      myrank;           // = id subdomain
+inline MPI_Comm sim_comm, inter_sim_comm, sub_comm, inter_sub_comm; // mpi communicator for simulation, inter-simulation, subdomain, and inter subdomains
+inline int      n_sims           = 1; // number of mpi groups for simultaneous runs
+inline int      n_procs_each_sim = 1; // number of processes in each simulation group
+inline int      n_subdomains     = 1; // number of subdomains
+inline int      n_subprocs       = 1; // number of sub processes in each subdomain
+inline int      ndiv_i           = 1; // number of divisions in x direction
+inline int      ndiv_j           = 1; // number of divisions in y direction
+inline int      ndiv_k           = 1; // number of divisions in z direction
+inline int      id_sim           = 0; // simultaneous run id  (not equal to src id)                 (parallel level 1)
+inline int      id_subdomain     = 0; // subdomain id                                               (parallel level 2)
+inline bool     subdom_main      = false; // true if this process is main process in subdomain      (parallel level 3)
+
+// flags for explaining the process's role
+inline bool proc_read_srcrec = false;  // true if this process is reading source file
+inline bool proc_store_srcrec = false; // true if this process is storing srcrec file
+
+// MNMN stop using these global variable for avoiding misleadings during the sources' iteration loop
+//inline int      id_sim_src       = 0; // id of current target source
+//inline std::string name_sim_src  = "unknown";   //name of current target source
+
+// rule to distribute source to simultaneous group
+inline int select_id_sim_for_src(const int& i_src, const int& n_sims){
+    return i_src % n_sims;
+}
+
+// read input yaml file
+inline std::string input_file="input_params.yaml";
+// output directory name
+inline std::string output_dir="./OUTPUT_FILES/";
+// output file format
+#define OUTPUT_FORMAT_HDF5 11111
+#define OUTPUT_FORMAT_ASCII 22222
+//#define OUTPUT_FORMAT_BINARY 33333 //#TODO: add
+inline int output_format = OUTPUT_FORMAT_HDF5; // 0 - ascii, 1 - hdf5, 2 - binary
+
+// HDF5 io mode (independent or collective)
+#ifdef USE_HDF5_IO_COLLECTIVE
+#define HDF5_IO_MODE H5FD_MPIO_COLLECTIVE
+#else
+#define HDF5_IO_MODE H5FD_MPIO_INDEPENDENT
+#endif
+
+// smooth parameters
+inline int        smooth_method = 0; // 0: multi grid parametrization, 1: laplacian smoothing
+inline CUSTOMREAL smooth_lp = 1.0;
+inline CUSTOMREAL smooth_lt = 1.0;
+inline CUSTOMREAL smooth_lr = 1.0;
+inline CUSTOMREAL regul_lp  = 1.0;
+inline CUSTOMREAL regul_lt  = 1.0;
+inline CUSTOMREAL regul_lr  = 1.0;
+inline const int  GRADIENT_DESCENT    = 0;
+inline const int  HALVE_STEPPING_MODE = 1;
+inline const int  LBFGS_MODE          = 2;
+inline int        optim_method        = 0; // 0: gradient descent, 1: halve_stepping, 2: LBFGS
+inline const CUSTOMREAL wolfe_c1     = 1e-4;
+inline const CUSTOMREAL wolfe_c2     = 0.9;
+inline const int        Mbfgs        = 5;            // number of gradients/models stored in memory
+inline CUSTOMREAL       regularization_weight = 0.5; // regularization weight
+inline int              max_sub_iterations    = 20;  // maximum number of sub-iterations
+inline const CUSTOMREAL LBFGS_RELATIVE_step_length = 3.0; // relative step size for the second and later iteration
+inline CUSTOMREAL       Kdensity_coe  = 0.0;          // a preconditioner \in [0,1], kernel -> kernel / pow(kernel density, Kdensity_coe)
+// variables for test
+inline bool if_test = false;
+
+// verboose mode
+inline bool if_verbose = false;
+//inline bool if_verbose = true;
+
+// if use gpu
+inline bool use_gpu;
+
+// total number of sources in the srcrec file
+inline int nsrc_total = 0;
+
+// flag if common receiver double difference data is used
+inline bool src_pair_exists = false;
+
+
+// weight of different typs of data
+inline CUSTOMREAL abs_time_local_weight    = 1.0;    // weight of absolute traveltime data for local earthquake,                        default: 1.0
+inline CUSTOMREAL cr_dif_time_local_weight = 1.0;    // weight of common receiver differential traveltime data for local earthquake,    default: 1.0
+inline CUSTOMREAL cs_dif_time_local_weight = 1.0;    // weight of common source differential traveltime data for local earthquake,      default: 1.0    (not ready)
+inline CUSTOMREAL teleseismic_weight       = 1.0;    // weight of teleseismic data                                                      default: 1.0    (not ready)
+inline CUSTOMREAL abs_time_local_weight_reloc    = 1.0;    // weight of absolute traveltime data for local earthquake,                        default: 1.0
+inline CUSTOMREAL cs_dif_time_local_weight_reloc = 1.0;    // weight of common source differential traveltime data for local earthquake,      default: 1.0
+inline CUSTOMREAL cr_dif_time_local_weight_reloc = 1.0;    // weight of common receiver differential traveltime data for local earthquake,    default: 1.0
+inline CUSTOMREAL teleseismic_weight_reloc       = 1.0;    // weight of teleseismic data                                                      default: 1.0    (not ready)
+
+// misfit balance
+inline bool       balance_data_weight      = false;    // add the weight to normalize the initial objective function of different types of data. 1 for yes and 0 for no
+inline CUSTOMREAL total_abs_local_data_weight    = 0.0;
+inline CUSTOMREAL total_cr_dif_local_data_weight = 0.0;
+inline CUSTOMREAL total_cs_dif_local_data_weight = 0.0;
+inline CUSTOMREAL total_teleseismic_data_weight  = 0.0;
+inline bool       balance_data_weight_reloc      = false;    // add the weight to normalize the initial objective function of different types of data for relocation. 1 for yes and 0 for no
+inline CUSTOMREAL total_abs_local_data_weight_reloc    = 0.0;
+inline CUSTOMREAL total_cr_dif_local_data_weight_reloc = 0.0;
+inline CUSTOMREAL total_cs_dif_local_data_weight_reloc = 0.0;
+inline CUSTOMREAL total_teleseismic_data_weight_reloc  = 0.0;
+
+// 2d solver parameters for teleseismic data
+// use fixed domain size for all 2d simulations in teleseismic data
+inline       CUSTOMREAL rmin_2d             = 3370.5;
+inline       CUSTOMREAL rmax_2d             = 6471.5;
+inline       CUSTOMREAL tmin_2d             = -5.0/180.0*PI;
+inline       CUSTOMREAL tmax_2d             = 120.0/180.0*PI;
+inline       CUSTOMREAL dr_2d               = 2.0;
+inline       CUSTOMREAL dt_2d               = PI/1000.0;
+inline const CUSTOMREAL TOL_2D_SOLVER       = 1e-5;
+inline const CUSTOMREAL MAX_ITER_2D_SOLVER  = 4000;
+inline const CUSTOMREAL SWEEPING_COEFF_TELE = 1.05;        // coefficient for calculationg sigr/sigt/sigp
+inline const int        N_LAYER_SRC_BOUND   = 1;           // number of layers for source boundary
+inline       CUSTOMREAL DIST_SRC_DDT        = 2.5*DEG2RAD; // distance threshold of two stations
+inline const std::string OUTPUT_DIR_2D      = "/2D_TRAVEL_TIME_FIELD/"; // output directory for 2d solver
+
+// earthquake relocation
+inline CUSTOMREAL       step_length_src_reloc       = 0.01;  // step length for source relocation
+inline CUSTOMREAL       step_length_decay_src_reloc = 0.9;
+inline int              N_ITER_MAX_SRC_RELOC        = 501;  // max iteration for source location
+inline CUSTOMREAL       TOL_SRC_RELOC               = 1e-3; // threshold of the norm of gradient for stopping single earthquake location
+inline const CUSTOMREAL TOL_step_length             = 1e-4; // threshold of the max step size for stopping single earthquake location
+inline CUSTOMREAL       rescaling_dep               = 10.0;
+inline CUSTOMREAL       rescaling_lat               = 5.0;
+inline CUSTOMREAL       rescaling_lon               = 5.0;
+inline CUSTOMREAL       rescaling_ortime            = 0.5;
+inline CUSTOMREAL       max_change_dep              = 10.0; // default: max change is 10 km
+inline CUSTOMREAL       max_change_lat              = 5.0;  // default: max change is 5 km
+inline CUSTOMREAL       max_change_lon              = 5.0;  // default: max change is 5 km
+inline CUSTOMREAL       max_change_ortime           = 0.5;  // default: max change is 0.5 s
+inline bool             ortime_local_search         = true;
+inline int              min_Ndata_reloc             = 4;    // if an earthquake is recorded by less than <min_Ndata> times, relocation is not allowed.
+
+// inversion strategy parameters
+inline int inv_mode                     = 1;
+inline int model_update_N_iter          = 1;
+inline int relocation_N_iter            = 1;
+inline int max_loop_mode0               = 10;
+inline int max_loop_mode1               = 10;
+
+// INV MODE TYPE FLAG
+inline const int ITERATIVE        = 0;
+inline const int SIMULTANEOUS     = 1;
+
+// source receiver weight calculation
+inline CUSTOMREAL ref_value = 1.0; // reference value for source receiver weight calculation
+inline std::string output_file_weight = "srcrec_weight.txt"; // output file name for source receiver weight calculation
+
+#endif // CONFIG_H
--- a/include/eikonal_solver_2d.h
+++ b/include/eikonal_solver_2d.h
@@ -0,0 +1,79 @@
+#ifndef EIKONAL_SOLVER_2D_H
+#define EIKONAL_SOLVER_2D_H
+
+#include <fstream>
+#include <string>
+#include <iostream>
+
+#include "utils.h"
+#include "input_params.h"
+#include "grid.h"
+#include "1d_models.h"
+#include "mpi_funcs.h"
+#include "io.h"
+
+
+class PlainGrid {
+
+public:
+    PlainGrid(Source& src, InputParams& IP);
+    ~PlainGrid();
+
+    void run_iteration(InputParams& IP);
+    void run_iteration_upwind(InputParams& IP);
+
+//private:
+    // grid parameters
+    CUSTOMREAL src_r, src_t, src_p;
+    CUSTOMREAL src_t_dummy;
+    CUSTOMREAL rmin_3d, rmax_3d, tmin_3d, tmax_3d, pmin_3d, pmax_3d;
+    CUSTOMREAL max_degree;
+    int nr_2d, nt_2d;
+
+    // for eikonal solver
+    int count_cand;
+    int ii,ii_nr,ii_n2r,ii_pr,ii_p2r,ii_nt,ii_n2t,ii_pt,ii_p2t;
+    CUSTOMREAL ar,br,at,bt,ar1,ar2,at1,at2,br1,br2,bt1,bt2;
+    CUSTOMREAL eqn_a, eqn_b, eqn_c, eqn_Delta;
+    CUSTOMREAL tmp_tau, T_r, T_t, charact_r, charact_t;
+    bool is_causality;
+    std::vector<CUSTOMREAL> canditate = std::vector<CUSTOMREAL>(60);
+
+    // arrays
+    // azimuth
+    CUSTOMREAL *azimuth_2d;
+    // epicentral distance
+    CUSTOMREAL *epicentral_distance_2d;
+    // activated boundaries
+    //bool *activated_boundaries;
+    // grid
+    CUSTOMREAL *r_2d, *t_2d; // coordinates
+    CUSTOMREAL *fun_2d, *fac_a_2d, *fac_b_2d, *u_2d, *T_2d; // functions
+    CUSTOMREAL *T0v_2d, *T0r_2d, *T0t_2d, *tau_2d, *tau_old_2d;
+    bool *is_changed_2d;
+
+    // member functions
+    void allocate_arrays();
+    void deallocate_arrays();
+    void load_1d_model(std::string);
+    void select_1d_model(std::string);
+    void calculate_stencil_2d(int&, int&);
+    void calculate_stencil_upwind_2d(int&, int&);
+    CUSTOMREAL eps_2d = 1e-12;
+
+private:
+
+    // # TODO: make id model type selectable by input file
+    // 1D model used for 2D solver
+    std::vector<std::vector<CUSTOMREAL>> model_1d_ref = model_1d_prem;
+
+
+};
+
+void prepare_teleseismic_boundary_conditions(InputParams&, Grid&, IO_utils&);
+void run_2d_solver(InputParams&, Source&, IO_utils&);
+void interp2d(PlainGrid&, CUSTOMREAL, CUSTOMREAL, CUSTOMREAL&);
+void load_2d_traveltime(InputParams&, Source&, Grid&, IO_utils&);
+std::string get_2d_tt_filename(const std::string&, Source&);
+
+#endif // EIKONAL_SOLVER_2D_H
--- a/include/grid.fwd.h
+++ b/include/grid.fwd.h
@@ -0,0 +1,2 @@
+#pragma once
+class Grid;
--- a/include/grid.h
+++ b/include/grid.h
@@ -0,0 +1,445 @@
+#ifndef GRID_H
+#define GRID_H
+
+// to handle circular dependency
+#pragma once
+#include "grid.fwd.h"
+#include "source.fwd.h"
+#include "io.fwd.h"
+
+#include <vector>
+#include <array>
+#include <algorithm>
+#include <cstring>
+#include <iomanip>
+
+#include "utils.h"
+#include "config.h"
+#include "mpi_funcs.h"
+#include "input_params.h"
+#include "source.h"
+#include "io.h"
+#include "inv_grid.h"
+
+#ifdef USE_CUDA
+#include <cuda_runtime.h>
+#include <cuda.h>
+#endif
+
+class Grid {
+public:
+    Grid(InputParams&, IO_utils&);
+    ~Grid();
+    //
+    // setter
+    //
+    // set physical parameters
+    void setup_grid_params(InputParams&, IO_utils&); // setup grid parameters
+    // set factors
+    void setup_factors(Source &);
+    // calculate initial fields T0 T0r T0t T0p and initialize tau
+    void initialize_fields(Source &, InputParams&);
+    // calculate initial fields T0 T0r T0t T0p and initialize tau for teleseismic source
+    void initialize_fields_teleseismic();
+    // calculate L1 and Linf diff (sum of value change on the nodes)
+    void calc_L1_and_Linf_diff(CUSTOMREAL&, CUSTOMREAL&);
+    // calculate L1 and Linf diff for teleseismic source (sum of value change on the nodes)
+    void calc_L1_and_Linf_diff_tele(CUSTOMREAL&, CUSTOMREAL&);
+    // FOR TEST: calculate L1 and Linf error between analytic solution and current T
+    void calc_L1_and_Linf_error(CUSTOMREAL&, CUSTOMREAL&);
+    // calculate L1 and Linf diff for adjoint field
+    void calc_L1_and_Linf_diff_adj(CUSTOMREAL&, CUSTOMREAL&);
+    // calculate L1 and Linf diff for density of adjoint field
+    void calc_L1_and_Linf_diff_adj_density(CUSTOMREAL&, CUSTOMREAL&);
+
+    // send and receive data to/from other subdomains
+    void send_recev_boundary_data(CUSTOMREAL*);
+    void prepare_boundary_data_to_send(CUSTOMREAL*);
+    void assign_received_data_to_ghost(CUSTOMREAL*);
+    void send_recev_boundary_data_av(CUSTOMREAL*);
+    void assign_received_data_to_ghost_av(CUSTOMREAL*);
+    // send and receive data to/from the neighbor subdomains which contact on a line or point
+    void send_recev_boundary_data_kosumi(CUSTOMREAL*);
+    void prepare_boundary_data_to_send_kosumi(CUSTOMREAL*);
+    void assign_received_data_to_ghost_kosumi(CUSTOMREAL*);
+
+    // inversion methods
+    InvGrid* inv_grid; // inversion grid definitions
+
+    // check model discontinuity
+    void check_velocity_discontinuity();
+
+    void reinitialize_abcf();   // reinitialize factors
+    void rejuvenate_abcf();     // reinitialize factors for earthquake relocation
+    void initialize_kernels();  // fill 0 to kernels
+
+    //
+    // getters
+    //
+
+    int get_ngrid_total_excl_ghost(){return loc_nnodes;};
+    int get_nelms_total_excl_ghost(){return loc_nelms;};
+    int get_offset_nnodes()         {return offset_nnodes;};
+    int get_offset_elms()           {return offset_nelms;};
+    void get_offsets_3d(int* arr) {
+        arr[0] = offset_k;
+        arr[1] = offset_j;
+        arr[2] = offset_i;
+    };
+
+    int get_ngrid_total_vis()  {return loc_nnodes_vis;};
+    int get_nelms_total_vis()  {return loc_nelms_vis;};
+    int get_offset_nnodes_vis(){return offset_nnodes_vis;};
+    int get_offset_elms_vis()  {return offset_nelms_vis;};
+
+    // return dimensions of node coordinates arra and connectivity array
+    std::array<int,2> get_dim_nodes_coords() {return {offset_nnodes,3};};
+    std::array<int,2> get_dim_elms_conn()    {return {offset_nelms,9};};
+
+    // return node coordinates array and connectivity array
+    int*        get_elms_conn()     {return elms_conn;};
+    CUSTOMREAL* get_nodes_coords_x(){return x_loc_3d;}; // lon
+    CUSTOMREAL* get_nodes_coords_y(){return y_loc_3d;}; // lat
+    CUSTOMREAL* get_nodes_coords_z(){return z_loc_3d;}; // r
+    CUSTOMREAL* get_nodes_coords_p(){return p_loc_3d;}; // lon
+    CUSTOMREAL* get_nodes_coords_t(){return t_loc_3d;}; // lat
+    CUSTOMREAL* get_nodes_coords_r(){return r_loc_3d;}; // r
+    int*        get_proc_dump()     {return my_proc_dump;}; // array storing the process ids
+
+    CUSTOMREAL* get_true_solution(){return get_array_for_vis(u_loc,     false);}; // true solution
+    CUSTOMREAL* get_fun()          {return get_array_for_vis(fun_loc,   false);}; //
+    CUSTOMREAL* get_xi()           {return get_array_for_vis(xi_loc,    false);}; //
+    CUSTOMREAL* get_eta()          {return get_array_for_vis(eta_loc,   false);}; //
+    CUSTOMREAL* get_a()            {return get_array_for_vis(fac_a_loc, false);}; //
+    CUSTOMREAL* get_b()            {return get_array_for_vis(fac_b_loc, false);}; //
+    CUSTOMREAL* get_c()            {return get_array_for_vis(fac_c_loc, false);}; //
+    CUSTOMREAL* get_f()            {return get_array_for_vis(fac_f_loc, false);}; //
+    CUSTOMREAL* get_vel()          {return get_array_for_vis(fun_loc,   true);}; // true velocity field
+    CUSTOMREAL* get_T0v()          {return get_array_for_vis(T0v_loc,   false);}; // initial T0
+    CUSTOMREAL* get_u()            {return get_array_for_vis(u_loc,     false);}; // current solution
+    CUSTOMREAL* get_tau()          {return get_array_for_vis(tau_loc,   false);}; // current tau
+    CUSTOMREAL* get_T()            {return get_array_for_vis(T_loc,     false);};
+    CUSTOMREAL* get_residual()     { calc_residual();
+                                     return get_array_for_vis(u_loc,   false);}; // calculate residual (T_loc over written!!)
+    CUSTOMREAL* get_Tadj()         {return get_array_for_vis(Tadj_loc, false);}; // adjoint solution
+    CUSTOMREAL* get_Ks()           {return get_array_for_vis(Ks_loc,   false);}; // Ks
+    CUSTOMREAL* get_Kxi()          {return get_array_for_vis(Kxi_loc,         false);}; // Kxi
+    CUSTOMREAL* get_Keta()         {return get_array_for_vis(Keta_loc,        false);}; // Keta
+    CUSTOMREAL* get_Ks_density()   {return get_array_for_vis(Ks_density_loc,    false);}; // Ks_density
+    CUSTOMREAL* get_Kxi_density()  {return get_array_for_vis(Kxi_density_loc,   false);}; // Kxi_density
+    CUSTOMREAL* get_Keta_density() {return get_array_for_vis(Keta_density_loc,  false);}; // Keta_density
+    CUSTOMREAL* get_Ks_update()    {return get_array_for_vis(Ks_update_loc,   false);}; // Ks_update
+    CUSTOMREAL* get_Kxi_update()   {return get_array_for_vis(Kxi_update_loc,  false);}; // Kxi_update
+    CUSTOMREAL* get_Keta_update()  {return get_array_for_vis(Keta_update_loc, false);}; // Keta_update
+    CUSTOMREAL* get_Ks_density_update()     {return get_array_for_vis(Ks_density_update_loc,false);}; // Ks_density_update
+    CUSTOMREAL* get_Kxi_density_update()    {return get_array_for_vis(Kxi_density_update_loc,false);}; // Kxi_density_update
+    CUSTOMREAL* get_Keta_density_update()   {return get_array_for_vis(Keta_density_update_loc,false);}; // Keta_density_update
+
+    CUSTOMREAL* get_Ks_descent_dir() {return get_array_for_vis(Ks_descent_dir_loc, false);}; // Ks_descent_dir
+    CUSTOMREAL* get_Kxi_descent_dir(){return get_array_for_vis(Kxi_descent_dir_loc,false);}; // Kxi_descent_dir
+    CUSTOMREAL* get_Keta_descent_dir(){return get_array_for_vis(Keta_descent_dir_loc,false);}; // Keta_descent_dir
+
+    // get physical parameters
+    CUSTOMREAL get_r_min()       {return r_min;};
+    CUSTOMREAL get_r_max()       {return r_max;};
+    CUSTOMREAL get_lat_min()     {return lat_min;};
+    CUSTOMREAL get_lat_max()     {return lat_max;};
+    CUSTOMREAL get_lon_min()     {return lon_min;};
+    CUSTOMREAL get_lon_max()     {return lon_max;};
+    CUSTOMREAL get_delta_lon()   {return dlon;};
+    CUSTOMREAL get_delta_lat()   {return dlat;};
+    CUSTOMREAL get_delta_r()     {return dr;};
+    CUSTOMREAL get_r_min_loc_excl_gl()   {return r_loc_1d[k_start_loc];};
+    CUSTOMREAL get_r_max_loc_excl_gl()   {return r_loc_1d[k_end_loc];};
+    CUSTOMREAL get_lat_min_loc_excl_gl() {return t_loc_1d[j_start_loc];};
+    CUSTOMREAL get_lat_max_loc_excl_gl() {return t_loc_1d[j_end_loc];};
+    CUSTOMREAL get_lon_min_loc_excl_gl() {return p_loc_1d[i_start_loc];};
+    CUSTOMREAL get_lon_max_loc_excl_gl() {return p_loc_1d[i_end_loc];};
+    CUSTOMREAL get_r_min_loc()   {return r_loc_1d[0];};
+    CUSTOMREAL get_r_max_loc()   {return r_loc_1d[loc_K-1];};
+    CUSTOMREAL get_lat_min_loc() {return t_loc_1d[0];};
+    CUSTOMREAL get_lat_max_loc() {return t_loc_1d[loc_J-1];};
+    CUSTOMREAL get_lon_min_loc() {return p_loc_1d[0];};
+    CUSTOMREAL get_lon_max_loc() {return p_loc_1d[loc_I-1];};
+    CUSTOMREAL get_r_by_index(int i) {return r_loc_1d[i];};
+    CUSTOMREAL get_lat_by_index(int j) {return t_loc_1d[j];};
+    CUSTOMREAL get_lon_by_index(int i) {return p_loc_1d[i];};
+
+    int get_offset_i()           {return offset_i-i_start_loc;}; // return offset index (global index of the first node)
+    int get_offset_j()           {return offset_j-j_start_loc;};
+    int get_offset_k()           {return offset_k-k_start_loc;};
+    int get_offset_i_excl_gl()           {return offset_i;}; // return offset index
+    int get_offset_j_excl_gl()           {return offset_j;};
+    int get_offset_k_excl_gl()           {return offset_k;};
+
+    // return index of the first node excluding ghost layer
+    int get_k_start_loc()        {return k_start_loc;};
+    int get_k_end_loc()          {return k_end_loc;};
+    int get_j_start_loc()        {return j_start_loc;};
+    int get_j_end_loc()          {return j_end_loc;};
+    int get_i_start_loc()        {return i_start_loc;};
+    int get_i_end_loc()          {return i_end_loc;};
+
+    // copy tau to tau old
+    void tau2tau_old();
+    // copy T to tau old
+    void T2tau_old();
+    // copy tau to Tadj
+    void update_Tadj();
+    // copy tau to Tadj
+    void update_Tadj_density();
+    // back up fun xi eta
+    void back_up_fun_xi_eta_bcf();
+    // restore fun xi eta
+    void restore_fun_xi_eta_bcf();
+
+    // write out inversion grid file
+    void write_inversion_grid_file();
+
+private:
+
+    //
+    // member variables
+    //
+    // i: longitude, phi,
+    // j: latitude,  theta,
+    // k: radius,    r
+
+    // starting node id for each direction expect the boundary
+    int i_start_loc=0, j_start_loc=0, k_start_loc=0;
+    // end node id for each direction expect the boundary
+    int i_end_loc=0, j_end_loc=0, k_end_loc=0;
+
+    // neighbors domain_id (-1 if no neighbor)
+    // order of directions: -i,+i,-j,+j,-k,+k
+    std::vector<int> neighbors_id{-1,-1,-1,-1,-1,-1};
+    // neighbors domain_id line and point contact
+    // ij plane nn, np, pn, pp
+    std::vector<int> neighbors_id_ij{-1,-1,-1,-1};
+    // jk plane nn, np, pn, pp
+    std::vector<int> neighbors_id_jk{-1,-1,-1,-1};
+    // ik plane  nn, np, pn, pp
+    std::vector<int> neighbors_id_ik{-1,-1,-1,-1};
+    // ijk plane nnn, nnp, npn, npp, pnn, pnp, ppn, ppp
+    std::vector<int> neighbors_id_ijk{-1,-1,-1,-1,-1,-1,-1,-1};
+
+    // layer id of this domain
+    int domain_i, domain_j, domain_k;
+
+    // number of points on each boundary surface
+    int n_grid_bound_i, n_grid_bound_j, n_grid_bound_k;
+
+    // mpi parameters
+    int offset_nnodes=0, offset_nelms=0;
+    int offset_i=0, offset_j=0, offset_k=0; // offset of this domain in the global grid
+
+    // number of nodes and elements on each subdomain
+    int loc_nnodes, loc_nelms;
+    int *nnodes, *nelms;
+
+    // arrays for visualization
+    //int loc_I_vis, loc_J_vis, loc_K_vis;
+    int i_start_vis, j_start_vis, k_start_vis;
+    int i_end_vis, j_end_vis, k_end_vis;
+    int loc_nnodes_vis, loc_nelms_vis;
+    int *nnodes_vis, *nelms_vis;
+    int offset_nnodes_vis=0, offset_nelms_vis=0;
+
+public:
+    // 3d arrays
+    CUSTOMREAL *xi_loc;    // local xi
+    CUSTOMREAL *eta_loc;   // local eta
+    CUSTOMREAL *zeta_loc;  // local zeta
+    CUSTOMREAL *fac_a_loc; // factor a
+    CUSTOMREAL *fac_b_loc; // factor b
+    CUSTOMREAL *fac_c_loc; // factor c
+    CUSTOMREAL *fac_f_loc; // factor f
+    CUSTOMREAL *fun_loc;
+    CUSTOMREAL *T_loc;
+    CUSTOMREAL *T0v_loc, *T0r_loc, *T0p_loc, *T0t_loc;
+    CUSTOMREAL *tau_loc;
+    CUSTOMREAL *tau_old_loc;
+    bool       *is_changed;
+    // for inversion backup
+    CUSTOMREAL *fun_loc_back;
+    CUSTOMREAL *xi_loc_back;
+    CUSTOMREAL *eta_loc_back;
+    CUSTOMREAL *fac_b_loc_back;
+    CUSTOMREAL *fac_c_loc_back;
+    CUSTOMREAL *fac_f_loc_back;
+    // for lbfgs
+    CUSTOMREAL *Ks_grad_store_loc, *Keta_grad_store_loc, *Kxi_grad_store_loc;
+    CUSTOMREAL *Ks_model_store_loc, *Keta_model_store_loc, *Kxi_model_store_loc;
+    CUSTOMREAL *Ks_descent_dir_loc, *Keta_descent_dir_loc, *Kxi_descent_dir_loc;
+    CUSTOMREAL *fun_regularization_penalty_loc, *eta_regularization_penalty_loc, *xi_regularization_penalty_loc;
+    CUSTOMREAL *fun_gradient_regularization_penalty_loc, *eta_gradient_regularization_penalty_loc, *xi_gradient_regularization_penalty_loc;
+    CUSTOMREAL *fun_prior_loc, *eta_prior_loc, *xi_prior_loc; // *zeta_prior_loc; TODO
+    // tmp array for file IO
+    CUSTOMREAL *vis_data;
+
+private:
+    // windows for shm arrays
+    MPI_Win win_fac_a_loc, win_fac_b_loc, win_fac_c_loc, win_fac_f_loc;
+    MPI_Win win_T0r_loc, win_T0p_loc, win_T0t_loc, win_T0v_loc;
+    MPI_Win win_tau_loc, win_fun_loc;
+    MPI_Win win_is_changed;
+    MPI_Win win_T_loc, win_tau_old_loc;
+    MPI_Win win_xi_loc, win_eta_loc, win_zeta_loc;
+    MPI_Win win_r_loc_1d, win_t_loc_1d, win_p_loc_1d;
+
+    CUSTOMREAL *x_loc_3d;     // local (lon) x (global position)
+    CUSTOMREAL *y_loc_3d;     // local (lat) y (global position)
+    CUSTOMREAL *z_loc_3d;     // local (r  ) z (global position)
+    CUSTOMREAL *p_loc_3d;     // local lon (x) (global position)
+    CUSTOMREAL *t_loc_3d;     // local lat (y) (global position)
+    CUSTOMREAL *r_loc_3d;     // local r   (z) (global position)
+    int        *elms_conn;    // connectivity array
+    int        *my_proc_dump; // dump process id for each node  DEBUG
+public:
+    // 1d arrays for coordinates, storing only subdomain's local coordinates
+    CUSTOMREAL *r_loc_1d; // radius z    in kilo meter
+    CUSTOMREAL *t_loc_1d; // theta lat y in radian
+    CUSTOMREAL *p_loc_1d; // phi lon x   in radian
+private:
+    // arrays for inversion
+    // TODO: check if those arrays can use shm
+    bool inverse_flag = false;
+    //int n_inv_grids; // in config.h
+    //int n_inv_I_loc, n_inv_J_loc, n_inv_K_loc; // in config.h
+public:
+    CUSTOMREAL *Ks_loc;
+    CUSTOMREAL *Kxi_loc;
+    CUSTOMREAL *Keta_loc;
+    CUSTOMREAL *Ks_density_loc;
+    CUSTOMREAL *Kxi_density_loc;
+    CUSTOMREAL *Keta_density_loc;
+    CUSTOMREAL *Tadj_loc; // timetable for adjoint source
+    CUSTOMREAL *Tadj_density_loc; // timetable for density of adjoint source
+    CUSTOMREAL *Ks_inv_loc;
+    CUSTOMREAL *Kxi_inv_loc;
+    CUSTOMREAL *Keta_inv_loc;
+    CUSTOMREAL *Ks_density_inv_loc;
+    CUSTOMREAL *Kxi_density_inv_loc;
+    CUSTOMREAL *Keta_density_inv_loc;
+    // model update para
+    CUSTOMREAL *Ks_update_loc;
+    CUSTOMREAL *Kxi_update_loc;
+    CUSTOMREAL *Keta_update_loc;
+    CUSTOMREAL *Ks_density_update_loc;
+    CUSTOMREAL *Kxi_density_update_loc;
+    CUSTOMREAL *Keta_density_update_loc;
+    // model update para of the previous step
+    CUSTOMREAL *Ks_update_loc_previous;
+    CUSTOMREAL *Kxi_update_loc_previous;
+    CUSTOMREAL *Keta_update_loc_previous;
+private:
+    MPI_Win     win_Tadj_loc;
+    MPI_Win     win_Tadj_density_loc;
+    // boundary layer for mpi communication
+    // storing not the value but the pointers for the elements of arrays
+    // to send
+    CUSTOMREAL *bin_s; // boundary i axis negative
+    CUSTOMREAL *bip_s; // boundary i axis positive
+    CUSTOMREAL *bjn_s; // boundary j axis negative
+    CUSTOMREAL *bjp_s; // boundary j axis positive
+    CUSTOMREAL *bkn_s; // boundary k axis negative
+    CUSTOMREAL *bkp_s; // boundary k axis positive
+    // to receive
+    CUSTOMREAL *bin_r; // boundary i axis negative
+    CUSTOMREAL *bip_r; // boundary i axis positive
+    CUSTOMREAL *bjn_r; // boundary j axis negative
+    CUSTOMREAL *bjp_r; // boundary j axis positive
+    CUSTOMREAL *bkn_r; // boundary k axis negative
+    CUSTOMREAL *bkp_r; // boundary k axis positive
+
+    // kosumi boundaries (touching on lines and points)
+    CUSTOMREAL *bij_pp_s, *bij_pp_r;
+    CUSTOMREAL *bij_pn_s, *bij_pn_r;
+    CUSTOMREAL *bij_np_s, *bij_np_r;
+    CUSTOMREAL *bij_nn_s, *bij_nn_r;
+    CUSTOMREAL *bjk_pp_s, *bjk_pp_r;
+    CUSTOMREAL *bjk_pn_s, *bjk_pn_r;
+    CUSTOMREAL *bjk_np_s, *bjk_np_r;
+    CUSTOMREAL *bjk_nn_s, *bjk_nn_r;
+    CUSTOMREAL *bik_pp_s, *bik_pp_r;
+    CUSTOMREAL *bik_pn_s, *bik_pn_r;
+    CUSTOMREAL *bik_np_s, *bik_np_r;
+    CUSTOMREAL *bik_nn_s, *bik_nn_r;
+    CUSTOMREAL *bijk_ppp_s,*bijk_ppp_r;
+    CUSTOMREAL *bijk_ppn_s,*bijk_ppn_r;
+    CUSTOMREAL *bijk_pnn_s,*bijk_pnn_r;
+    CUSTOMREAL *bijk_npp_s,*bijk_npp_r;
+    CUSTOMREAL *bijk_npn_s,*bijk_npn_r;
+    CUSTOMREAL *bijk_nnp_s,*bijk_nnp_r;
+    CUSTOMREAL *bijk_nnn_s,*bijk_nnn_r;
+    CUSTOMREAL *bijk_pnp_s,*bijk_pnp_r;
+
+    // store MPI_Request for sending and receiving
+    MPI_Request *mpi_send_reqs;
+    MPI_Request *mpi_recv_reqs;
+    MPI_Request *mpi_send_reqs_kosumi;
+    MPI_Request *mpi_recv_reqs_kosumi;
+    //
+    // domain definition
+    //
+    CUSTOMREAL r_min;   // minimum radius of global domain
+    CUSTOMREAL r_max;   // maximum radius of global domain
+    CUSTOMREAL lat_min; // minimum latitude of global domain
+    CUSTOMREAL lat_max; // maximum latitude of global domain
+    CUSTOMREAL lon_min; // minimum longitude of global domain
+    CUSTOMREAL lon_max; // maximum longitude of global domain
+public:
+    CUSTOMREAL dr  ;
+    CUSTOMREAL dlat,dt;
+    CUSTOMREAL dlon,dp;
+private:
+
+    //
+    // members for test
+    //
+    CUSTOMREAL *u_loc;    // true solution # TODO: erase for no testing
+    //CUSTOMREAL *velo_loc; // velocity field, # TODO: use this for storing an intial model
+    // anisotropic factors
+    CUSTOMREAL a0, b0, c0, f0, fun0;
+
+    CUSTOMREAL source_width;
+    //
+    // member functions
+    //
+    void init_decomposition(InputParams&);   // initialize domain decomposition
+    void memory_allocation();                // allocate memory for arrays
+    void memory_deallocation();              // deallocate memory from arrays
+    void shm_memory_allocation();            // allocate memory for shared memory
+    void shm_memory_deallocation();          // deallocate memory from shared memory
+
+    CUSTOMREAL *get_array_for_vis(CUSTOMREAL *arr, bool);  // get array for visualization
+public:
+    void get_array_for_3d_output(const CUSTOMREAL *arr_in, CUSTOMREAL* arr_out, bool inverse_field);  // get array for 3d output
+    void        set_array_from_vis(CUSTOMREAL *arr); // set vis array to local array
+
+    // finalize the Time table
+    void calc_T_plus_tau();
+private:
+    // check difference between true solution and computed solution
+    void calc_residual();
+
+    //
+    // utilities
+    //
+public:
+    bool i_first(){ return neighbors_id[0] == -1;}
+    bool j_first(){ return neighbors_id[2] == -1;}
+    bool k_first(){ return neighbors_id[4] == -1;}
+    bool i_last(){  return neighbors_id[1] == -1;}
+    bool j_last(){  return neighbors_id[3] == -1;}
+    bool k_last(){  return neighbors_id[5] == -1;}
+private:
+    // number of the layers for ghost nodes
+    const int n_ghost_layers = 1;
+    // vtk format cell type
+    const int cell_type = 9;
+
+
+};
+
+#endif // GRID_H
--- a/include/input_params.h
+++ b/include/input_params.h
@@ -0,0 +1,574 @@
+#ifndef INPUT_PARAMS_H
+#define INPUT_PARAMS_H
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+#include <iomanip>
+#include <unordered_set>
+#include <map>
+#include <stdexcept>
+
+#include "config.h"
+#include "yaml-cpp/yaml.h"
+#include "utils.h"
+#include "mpi_funcs.h"
+#include "timer.h"
+#include "src_rec.h"
+
+
+// InputParams class for reading/storing and outputing input parameters and src_rec information
+class InputParams {
+
+public:
+    InputParams(std::string&);
+    ~InputParams();
+
+    // write parameters to output file
+    void write_params_to_file();
+
+    //
+    // getter/setter
+    //
+
+    //
+    // bondary information
+    //
+    CUSTOMREAL get_min_dep(){return min_dep;};
+    CUSTOMREAL get_max_dep(){return max_dep;};
+    CUSTOMREAL get_min_lat(){return min_lat*DEG2RAD;};
+    CUSTOMREAL get_max_lat(){return max_lat*DEG2RAD;};
+    CUSTOMREAL get_min_lon(){return min_lon*DEG2RAD;};
+    CUSTOMREAL get_max_lon(){return max_lon*DEG2RAD;};
+
+    //
+    // source receiver data
+    //
+
+    // source receiver information for processes which stores source receiver information
+    std::string                 get_src_rec_file()      {return src_rec_file;};
+    bool                        get_src_rec_file_exist(){return src_rec_file_exist;};
+    SrcRecInfo&                 get_src_point(const std::string&);          // return SrcRec object
+    SrcRecInfo&                 get_rec_point(const std::string&);          // return receivers for the current source
+
+    // source receiver information with broadcast to all subdom_main processes
+    SrcRecInfo                  get_src_point_bcast(const std::string&);    // return SrcRec object
+    SrcRecInfo                  get_rec_point_bcast(const std::string&);    // return receivers for the current source
+    CUSTOMREAL                  get_src_radius(   const std::string&);
+    CUSTOMREAL                  get_src_lat(      const std::string&);
+    CUSTOMREAL                  get_src_lon(      const std::string&);
+    SrcRecInfo                  get_src_point_bcast_2d(const std::string&);    // return SrcRec object
+    CUSTOMREAL                  get_src_radius_2d(const std::string&);
+    CUSTOMREAL                  get_src_lat_2d(   const std::string&);
+    CUSTOMREAL                  get_src_lon_2d(   const std::string&);
+    std::string                 get_src_name(const int&);                   // return source name from in-sim_group id
+    std::string                 get_rec_name(const int&);                   // return receiver name from in-sim_group id
+    std::string                 get_src_name_comm(const int&);              // return source name in common receiver list
+    int                         get_src_id(const std::string&);             // return src global id from src name
+    bool                        get_if_src_teleseismic(const std::string&); // return true if the source is teleseismic
+
+    //
+    // others
+    //
+    CUSTOMREAL get_conv_tol()                    {return conv_tol;};
+    void       set_conv_tol(CUSTOMREAL conv_tol_){conv_tol = conv_tol_;};
+    CUSTOMREAL get_max_iter()                    {return max_iter;};
+
+    int  get_stencil_order()                  {return stencil_order;};
+    void set_stencil_order(int stencil_order_){stencil_order = stencil_order_;};
+    int  get_stencil_type()                   {return stencil_type;};
+    int  get_sweep_type()                     {return sweep_type;};
+
+    std::string get_init_model_path(){return init_model_path;};
+    std::string get_model_1d_name()  {return model_1d_name;};
+
+    int get_run_mode()        {return run_mode;};
+
+    // invgrid for vel
+    int get_n_inversion_grid(){return n_inversion_grid;};
+
+    int        get_n_inv_r()    {return n_inv_r;};
+    int        get_n_inv_t()    {return n_inv_t;};
+    int        get_n_inv_p()    {return n_inv_p;};
+    int        get_n_inv_r_ani(){return n_inv_r_ani;};
+    int        get_n_inv_t_ani(){return n_inv_t_ani;};
+    int        get_n_inv_p_ani(){return n_inv_p_ani;};
+    CUSTOMREAL get_min_dep_inv(){return min_dep_inv;};
+    CUSTOMREAL get_max_dep_inv(){return max_dep_inv;};
+    CUSTOMREAL get_min_lat_inv(){return min_lat_inv*DEG2RAD;};
+    CUSTOMREAL get_max_lat_inv(){return max_lat_inv*DEG2RAD;};
+    CUSTOMREAL get_min_lon_inv(){return min_lon_inv*DEG2RAD;};
+    CUSTOMREAL get_max_lon_inv(){return max_lon_inv*DEG2RAD;};
+    CUSTOMREAL get_min_dep_inv_ani(){return min_dep_inv_ani;};
+    CUSTOMREAL get_max_dep_inv_ani(){return max_dep_inv_ani;};
+    CUSTOMREAL get_min_lat_inv_ani(){return min_lat_inv_ani*DEG2RAD;};
+    CUSTOMREAL get_max_lat_inv_ani(){return max_lat_inv_ani*DEG2RAD;};
+    CUSTOMREAL get_min_lon_inv_ani(){return min_lon_inv_ani*DEG2RAD;};
+    CUSTOMREAL get_max_lon_inv_ani(){return max_lon_inv_ani*DEG2RAD;};
+
+    int         get_n_inv_r_flex(){return n_inv_r_flex;};
+    int         get_n_inv_t_flex(){return n_inv_t_flex;};
+    int         get_n_inv_p_flex(){return n_inv_p_flex;};
+    CUSTOMREAL* get_dep_inv()     {return dep_inv;};
+    CUSTOMREAL* get_lat_inv()     {return lat_inv;};
+    CUSTOMREAL* get_lon_inv()     {return lon_inv;};
+    CUSTOMREAL* get_trapezoid()   {return trapezoid;};
+
+    bool        get_ignore_velocity_discontinuity(){return ignore_velocity_discontinuity;};
+
+    // invgrid for ani
+    bool        get_invgrid_ani()     {return invgrid_ani;};
+
+    int         get_n_inv_r_flex_ani(){return n_inv_r_flex_ani;};
+    int         get_n_inv_t_flex_ani(){return n_inv_t_flex_ani;};
+    int         get_n_inv_p_flex_ani(){return n_inv_p_flex_ani;};
+    CUSTOMREAL* get_dep_inv_ani()     {return dep_inv_ani;};
+    CUSTOMREAL* get_lat_inv_ani()     {return lat_inv_ani;};
+    CUSTOMREAL* get_lon_inv_ani()     {return lon_inv_ani;};
+    CUSTOMREAL* get_trapezoid_ani()   {return trapezoid_ani;};
+
+    //
+    void check_inv_grid();
+private:
+    void check_increasing(CUSTOMREAL*, int, std::string);
+    void check_lower_bound(CUSTOMREAL*&, int&, CUSTOMREAL, std::string);
+    void check_upper_bound(CUSTOMREAL*&, int&, CUSTOMREAL, std::string);
+
+public:
+    // void check_dep_inv();
+    // void check_lat_inv();
+    // void check_lon_inv();
+    // void check_dep_inv_ani();
+    // void check_lat_inv_ani();
+    // void check_lon_inv_ani();
+
+    // // adjust inv grid
+    // void insertAtFront(CUSTOMREAL*&, int&, CUSTOMREAL);
+    // void insertAtBack(CUSTOMREAL*&, int&, CUSTOMREAL);
+
+    // bool get_invgrid_volume_rescale(){return invgrid_volume_rescale;}
+
+    int  get_max_iter_inv() {return max_iter_inv;};
+
+    int  get_model_update_N_iter(){return model_update_N_iter;};
+    int  get_relocation_N_iter()  {return relocation_N_iter;};
+    int  get_max_loop_mode0()           {return max_loop_mode0;};
+    int  get_max_loop_mode1()           {return max_loop_mode1;};
+
+    bool get_is_srcrec_swap() {return swap_src_rec;};
+
+    bool get_if_output_source_field()    {return output_source_field;};
+    bool get_if_output_kernel()          {return output_kernel;};
+    bool get_if_output_final_model()     {return output_final_model;};
+    bool get_if_output_middle_model()     {return output_middle_model;};
+    bool get_if_output_in_process()      {return output_in_process;};
+    bool get_if_output_in_process_data() {return output_in_process_data;};
+    bool get_if_single_precision_output(){return single_precision_output;};
+    int  get_verbose_output_level()      {return verbose_output_level;}; // #TODO: modify codes for verbose_output_level > 1
+
+    bool get_update_slowness()        {return update_slowness;};
+    bool get_update_azi_ani()         {return update_azi_ani;};
+    bool get_update_rad_ani()         {return update_rad_ani;};
+    CUSTOMREAL * get_depth_taper()    {return depth_taper;};
+
+    bool get_have_tele_data()         {return have_tele_data;};
+
+    bool get_use_abs()                  {return use_abs;};
+    bool get_use_cs()                   {return use_cs;};
+    bool get_use_cr()                   {return use_cr;};
+
+    bool get_use_abs_reloc()                  {return use_abs_reloc;};
+    bool get_use_cs_reloc()                   {return use_cs_reloc;};
+    bool get_use_cr_reloc()                   {return use_cr_reloc;};
+
+    CUSTOMREAL* get_residual_weight_abs()   {return residual_weight_abs;};
+    CUSTOMREAL* get_distance_weight_abs()   {return distance_weight_abs;};
+    CUSTOMREAL* get_residual_weight_cs()    {return residual_weight_cs;};
+    CUSTOMREAL* get_azimuthal_weight_cs()   {return azimuthal_weight_cs;};
+    CUSTOMREAL* get_residual_weight_cr()    {return residual_weight_cr;};
+    CUSTOMREAL* get_azimuthal_weight_cr()   {return azimuthal_weight_cr;};
+
+
+    CUSTOMREAL* get_residual_weight_abs_reloc()   {return residual_weight_abs_reloc;};
+    CUSTOMREAL* get_distance_weight_abs_reloc()   {return distance_weight_abs_reloc;};
+    CUSTOMREAL* get_residual_weight_cs_reloc()    {return residual_weight_cs_reloc;};
+    CUSTOMREAL* get_azimuthal_weight_cs_reloc()   {return azimuthal_weight_cs_reloc;};
+    CUSTOMREAL* get_residual_weight_cr_reloc()    {return residual_weight_cr_reloc;};
+    CUSTOMREAL* get_azimuthal_weight_cr_reloc()   {return azimuthal_weight_cr_reloc;};
+
+    // get if the T field is written into the file
+    bool get_is_T_written_into_file(const std::string&);
+
+    // prepare source list for this simulation group
+    void prepare_src_map();
+
+    // (relocation) modify (swapped source) receiver's location and time
+    void modify_swapped_source_location();
+
+    // write out src_rec_file
+    void write_src_rec_file(int,int);
+
+    // write out station_correction_file
+    void write_station_correction_file(int);
+
+    // station correction
+    void station_correction_update( CUSTOMREAL );
+
+    int n_src_this_sim_group = 0;          // number of sources in this simultaneous group
+    int n_src_comm_rec_this_sim_group = 0; // number of sources with common receiver in this simultaneous group
+    int n_src_2d_this_sim_group = 0;       // number of sources assigned for 2d solver in this simultaneous group
+    int n_rec_this_sim_group = 0;          // number of receivers in this simultaneous group
+
+    std::map<std::string, SrcRecInfo> src_map_all;          // map of all sources (full information is only stored by the main process)
+    std::map<std::string, SrcRecInfo> src_map;              // map of sources belonging to this simultaneous group
+    std::map<std::string, SrcRecInfo> src_map_comm_rec;     // map of sources with common receiver
+    std::map<std::string, SrcRecInfo> src_map_2d;           // map of sources assigned for 2d solver
+    std::map<std::string, SrcRecInfo> src_map_tele;         // source list for teleseismic
+
+    std::map<std::string, SrcRecInfo> rec_map_all;     // map of all receivers (full information is only stored by the main process)
+    std::map<std::string, SrcRecInfo> rec_map;         // map of receivers belonging to this simultaneous group
+    std::map<std::string, SrcRecInfo> rec_map_tele;    // rec list for teleseismic
+
+    // datainfo-vector maps <src_name, rec_name>
+    std::map< std::string, std::map<std::string, std::vector<DataInfo>>> data_map_all;     // data list for all data (full information is only stored by the main process)
+    std::map< std::string, std::map<std::string, std::vector<DataInfo>>> data_map;         // data list for this simultaneous group
+    std::map< std::string, std::map<std::string, std::vector<DataInfo>>> data_map_tele;    // data list for teleseismic
+
+    std::vector<std::string> name_for_reloc;    // name list of receivers (swarpped sources) for location
+
+    // src id <-> src name relations
+    std::vector<std::string>                           src_id2name;          // name list of sources belonging to this simultaneous group
+    std::vector<std::string>                           rec_id2name;          // name list of receivers belongig to this simultaneous group
+    std::vector<std::string>                           src_id2name_comm_rec; // name list of sources with common receiver
+    std::vector<std::string>                           src_id2name_2d;       // name list of sources assigned for 2d solver.
+    std::vector<std::string>                           src_id2name_all;      // name list of all sources (store the order of sources in src_rec file)
+    std::vector<std::string>                           src_id2name_back;     // back up of name list of all sources (this will not be swapped)
+    std::vector<std::vector<std::vector<std::string>>> rec_id2name_back;     // back up of the name list of all receivers for each source (this will not be swapped)
+
+    // backups used when outputing the data
+    std::map<std::string, SrcRecInfo>                                   src_map_back;
+    std::map<std::string, SrcRecInfo>                                   rec_map_back;
+    std::map<std::string, std::map<std::string, std::vector<DataInfo>>> data_map_back;
+
+    // the number of data
+    int N_abs_local_data    = 0;
+    int N_cr_dif_local_data = 0;
+    int N_cs_dif_local_data = 0;
+    int N_teleseismic_data  = 0;
+    int N_data              = 0;
+    // the number of the types of data
+    int N_data_type = 0;
+    std::map<std::string, int> data_type;     // element: "abs", "cs_dif", "cr_dif", "tele"
+
+    // initialize_adjoint_source
+    void initialize_adjoint_source();
+    // set adjoint source && adjoint source density
+    void set_adjoint_source(std::string, CUSTOMREAL);
+    void set_adjoint_source_density(std::string, CUSTOMREAL);
+
+    // gather traveltimes and calculate differences of synthetic data
+    void gather_traveltimes_and_calc_syn_diff();
+
+    // reduce necessary data in rec_map, which has differed elements in each sim group.
+    template <typename T>
+    void allreduce_rec_map_var(T&);
+
+    void allreduce_rec_map_vobj_src_reloc();
+    void allreduce_rec_map_grad_src();
+
+private:
+    // boundary information
+    CUSTOMREAL min_dep; // minimum depth in km
+    CUSTOMREAL max_dep; // maximum depth in km
+    CUSTOMREAL min_lat; // minimum latitude in degree
+    CUSTOMREAL max_lat; // maximum latitude in degree
+    CUSTOMREAL min_lon; // minimum longitude in degree
+    CUSTOMREAL max_lon; // maximum longitude in degree
+
+    // source information
+    CUSTOMREAL  src_dep;                     // source depth in km
+    CUSTOMREAL  src_lat;                     // source latitude in degrees
+    CUSTOMREAL  src_lon;                     // source longitude in degrees
+    std::string src_rec_file;                // source receiver file
+    std::string src_rec_file_out;            // source receiver file to be output
+    std::string sta_correction_file;         // station correction file to be input
+    std::string station_correction_file_out; // station correction file to be output
+    bool src_rec_file_exist        = false;  // source receiver file exist
+    bool sta_correction_file_exist = false;  // station correction file exist
+    bool swap_src_rec              = false;  // whether the src/rec are swapped or not
+
+    // model input files
+    std::string init_model_path; // model file path init
+    std::string model_1d_name;   // name of 1d model for teleseismic tomography
+
+    // inversion
+    int run_mode=0;                                                 // do inversion or not (0: no, 1: yes)
+    int n_inversion_grid=1;                                         // number of inversion grid
+
+    // inversion grid setting
+    static const int n_trapezoid = 3;
+    CUSTOMREAL trapezoid[n_trapezoid]     = {1.0, 0.0, 50.0};
+    CUSTOMREAL trapezoid_ani[n_trapezoid] = {1.0, 0.0, 50.0};
+
+
+    // uniform or flexible inversion grid (0: uniform, 1: flexible, 2: trapezoid)
+    // int type_invgrid_dep=0;
+    // int type_invgrid_lat=0;
+    // int type_invgrid_lon=0;
+    // uniform or flexible anisotropic inversion grid (0: uniform, 1: flexible, 2: trapezoid)
+    // int type_invgrid_dep_ani=0;
+    // int type_invgrid_lat_ani=0;
+    // int type_invgrid_lon_ani=0;
+
+    //
+    // variables for type = 0: uniform inversion grid
+    //
+
+    bool uniform_inv_grid_dep = false; // automatically generate inversion grid for dep or not
+    bool uniform_inv_grid_lat = false; // automatically generate inversion grid for lat or not
+    bool uniform_inv_grid_lon = false; // automatically generate inversion grid for lon or not
+    // number of uniform inversion grid nodes in r, t, p direction
+    int n_inv_r=0;
+    int n_inv_t=0;
+    int n_inv_p=0;
+    // number of uniform anisotropic inversion grid nodes in r, t, p direction
+    int n_inv_r_ani=0;
+    int n_inv_t_ani=0;
+    int n_inv_p_ani=0;
+
+    // type = 0: uniform inversion grid
+    // inversion grid
+    CUSTOMREAL min_dep_inv=-99999; // minimum depth in km
+    CUSTOMREAL max_dep_inv=-99999; // maximum depth in km
+    CUSTOMREAL min_lat_inv=-99999; // minimum latitude
+    CUSTOMREAL max_lat_inv=-99999; // maximum latitude
+    CUSTOMREAL min_lon_inv=-99999; // minimum longitude
+    CUSTOMREAL max_lon_inv=-99999; // maximum longitude
+
+    // inversion grid for anisotropic parameters
+    CUSTOMREAL min_dep_inv_ani=-99999; // minimum depth in km
+    CUSTOMREAL max_dep_inv_ani=-99999; // maximum depth in km
+    CUSTOMREAL min_lat_inv_ani=-99999; // minimum latitude
+    CUSTOMREAL max_lat_inv_ani=-99999; // maximum latitude
+    CUSTOMREAL min_lon_inv_ani=-99999; // minimum longitude
+    CUSTOMREAL max_lon_inv_ani=-99999; // maximum longitude
+    void setup_uniform_inv_grid();
+
+    //
+    // variables fo type = 1: flexible inversion grid
+    //
+
+    CUSTOMREAL *dep_inv     = nullptr; // array for storing inversion grid points in depth
+    CUSTOMREAL *lat_inv     = nullptr; // array for storing inversion grid points in latitude
+    CUSTOMREAL *lon_inv     = nullptr; // array for storing inversion grid points in longitude
+    CUSTOMREAL *dep_inv_ani = nullptr; // array for storing inversion grid points in depth for anisotropy
+    CUSTOMREAL *lat_inv_ani = nullptr; // array for storing inversion grid points in latitude for anisotropy
+    CUSTOMREAL *lon_inv_ani = nullptr; // array for storing inversion grid points in longitude for anisotropy
+
+    // number of flexibly designed inversion grid in r, t, p direction
+    int n_inv_r_flex=0;
+    int n_inv_t_flex=0;
+    int n_inv_p_flex=0;
+    // number of flexibly designed inversion grid in r, t, p direction
+    int n_inv_r_flex_ani=0;
+    int n_inv_t_flex_ani=0;
+    int n_inv_p_flex_ani=0;
+
+    // if true, use defined inversion grid for anisotropy. Otherwise, use the same inversion grid of velocity for anisotropy
+    bool invgrid_ani = false;
+
+    // inversion grid volume rescale (kernel -> kernel / volume of inversion grid mesh)
+    // bool invgrid_volume_rescale = false;
+
+    // date usage setting and weights
+    bool use_abs = false; // use absolute travel time or not
+    bool use_cs  = false; // use common source double difference or not
+    bool use_cr  = false; // use common receiver double difference or not
+    static const int n_weight = 4;
+    CUSTOMREAL residual_weight_abs[n_weight] = {   1.0,   3.0, 1.0, 1.0};
+    CUSTOMREAL residual_weight_cs[n_weight]  = {   1.0,   3.0, 1.0, 1.0};
+    CUSTOMREAL residual_weight_cr[n_weight]  = {   1.0,   3.0, 1.0, 1.0};
+    CUSTOMREAL distance_weight_abs[n_weight] = { 100.0, 200.0, 1.0, 1.0};
+    CUSTOMREAL azimuthal_weight_cs[n_weight] = {  15.0,  30.0, 1.0, 1.0};
+    CUSTOMREAL azimuthal_weight_cr[n_weight] = {  15.0,  30.0, 1.0, 1.0};
+
+    // for relocation
+    bool use_abs_reloc = false; // use absolute travel time or not
+    bool use_cs_reloc  = false; // use common source double difference or not
+    bool use_cr_reloc  = false; // use common source double difference or not
+    CUSTOMREAL residual_weight_abs_reloc[n_weight] = {   1.0,   3.0, 1.0, 1.0};
+    CUSTOMREAL distance_weight_abs_reloc[n_weight] = {   1.0,   3.0, 1.0, 1.0};
+    CUSTOMREAL residual_weight_cs_reloc[n_weight]  = {   1.0,   3.0, 1.0, 1.0};
+    CUSTOMREAL azimuthal_weight_cs_reloc[n_weight] = { 100.0, 200.0, 1.0, 1.0};
+    CUSTOMREAL residual_weight_cr_reloc[n_weight]  = {  15.0,  30.0, 1.0, 1.0};
+    CUSTOMREAL azimuthal_weight_cr_reloc[n_weight] = {  15.0,  30.0, 1.0, 1.0};
+
+    // convergence setting
+    CUSTOMREAL conv_tol = 1e-05;     // convergence tolerance
+    int        max_iter = 500;       // maximum number of iterations
+    int        max_iter_inv=1;       // maximum number of iterations for inversion
+
+    // calculation method
+    int stencil_order = 1; // stencil order
+    int stencil_type  = 1; // stencil type  (0: non upwind; 1: upwind)
+    int sweep_type    = 0; // sweep type (0: legacy, 1: cuthil-mckee with shm parallelization)
+
+    // gather all arrival times to a main process
+    void gather_all_arrival_times_to_main();
+    // gather rec info to main process
+    void gather_rec_info_to_main();
+
+    // generate a map of sources which include common receiver double difference data
+    void generate_src_map_with_common_receiver(std::map<std::string, std::map<std::string, std::vector<DataInfo>>>&,
+                                             std::map<std::string, SrcRecInfo>&,
+                                             std::vector<std::string>&);
+
+    bool i_first=false, i_last=false, \
+         j_first=false, j_last=false, \
+         k_first=false; // store info if this subdomain has outer boundary
+
+    // check contradictions in input parameters
+    void check_contradictions();
+
+    // output setting
+    bool output_source_field    = false; // output out_data_sim_X.h or not.
+    bool output_kernel          = false; // output kernel or not.
+    bool output_final_model     = true;  // output merged final model or not.
+    bool output_middle_model    = false; // output merged model at each inv iteration or not.
+    bool output_in_process      = true;  // True: output at each inv iteration, otherwise, only output step 0, Niter-1, Niter. Default: true. File: "out_data_sim_group_0". Keys: "Kdensity_update_inv_XXXX", "vel_inv_0000", "xi_inv_0000", "eta_inv_0000"
+    bool output_in_process_data = true;  // output src_rec_file at each inv iteration or not.
+    int  verbose_output_level   = 0;     // output verbose information or not.
+
+    // inversion setting
+    bool update_slowness = true;  // update slowness (velocity) or not.
+    bool update_azi_ani  = false; // update azimuthal anisotropy (xi, eta) or not.
+    bool update_rad_ani  = false; // update radial anisotropy (in future) or not.
+
+    // have have_tele_data or not?
+    bool have_tele_data = false;  // default is false. Error will occur if teleseismic data is include but false.
+
+    // ignore velocity discontinuity
+    bool ignore_velocity_discontinuity = false; // default is false. Error will occur if model velocity has discontinuity ( v[i+1] > v[i] * 1.2 or v[i+1] < v[i] * 0.8).
+
+    CUSTOMREAL depth_taper[2] = {-9999999, -9999998};   // kernel weight:  0: -inf ~ taper[0]; 0 ~ 1 : taper[0] ~ taper[1]; 1 : taper[1] ~ inf
+    bool use_sta_correction = false; // apply station correction or not.
+
+    // single precision (float) output mode
+    bool single_precision_output = false;
+
+
+};
+
+
+//
+// utils
+//
+inline DataInfo& get_data_src_rec(std::vector<DataInfo>& v){
+    // return the first element in the vector with is_rec_pair = true
+    for (auto it = v.begin(); it != v.end(); it++){
+        if (it->is_src_rec)
+            return *it;
+    }
+
+    // error if no rec pair is found
+    std::cerr << "Error: no src/rec is found in get_data_src_rec" << std::endl;
+    exit(1);
+
+    // return the first element in the vector as a dummy
+    return v[0];
+}
+
+
+inline DataInfo& get_data_rec_pair(std::map<std::string, std::map<std::string, std::vector<DataInfo>>>& v,
+                                   const std::string& name_src,
+                                   const std::string& name_rec1,
+                                   const std::string& name_rec2){
+    // return the first element in the vector with is_rec_pair = true
+    auto& map1 = v[name_src][name_rec1];
+    auto& map2 = v[name_src][name_rec2];
+
+    for (auto it = map1.begin(); it != map1.end(); it++){
+        if (it->is_rec_pair && it->name_rec_pair[0] == name_rec1 && it->name_rec_pair[1] == name_rec2)
+            return *it;
+    }
+
+    for (auto it = map2.begin(); it != map2.end(); it++){
+        if (it->is_rec_pair && it->name_rec_pair[0] == name_rec1 && it->name_rec_pair[1] == name_rec2)
+            return *it;
+    }
+
+    // error if no rec pair is found
+    std::cerr << "Error: no rec pair is found in get_data_rec_pair" << std::endl;
+    exit(1);
+
+    // return the first element in the vector as a dummy
+    return v[name_src][name_rec1][0];
+}
+
+
+inline DataInfo& get_data_src_pair(std::map<std::string, std::map<std::string, std::vector<DataInfo>>>& v,
+                                   const std::string& name_src1,
+                                   const std::string& name_src2,
+                                   const std::string& name_rec){
+
+    // return the first element in the vector with is_rec_pair = true
+    auto& map1 = v[name_src1][name_rec];
+    auto& map2 = v[name_src2][name_rec];
+
+    for (auto it = map1.begin(); it != map1.end(); it++){
+        if (it->is_src_pair && it->name_src_pair[0] == name_src1 && it->name_src_pair[1] == name_src2)
+            return *it;
+    }
+
+    for (auto it = map2.begin(); it != map2.end(); it++){
+        if (it->is_src_pair && it->name_src_pair[0] == name_src2 && it->name_src_pair[1] == name_src1)
+            return *it;
+    }
+
+    // error if no src pair is found
+    std::cerr << "Error: no src pair is found in get_data_src_pair" << std::endl;
+    exit(1);
+
+    // return the first element in the vector as a dummy
+    return v[name_src1][name_rec][0];
+}
+
+
+inline void set_cr_dif_to_src_pair(std::map<std::string, std::map< std::string, std::vector<DataInfo>>>& v,
+                                   std::string& name_src1,
+                                   std::string& name_src2,
+                                   std::string& name_rec,
+                                   CUSTOMREAL& cr_dif){
+    // return the first element in the vector with is_rec_pair = true
+
+    std::vector<DataInfo>& vdata = v[name_src1][name_rec];
+
+    for (auto it = vdata.begin(); it != vdata.end(); it++){
+        if (it->is_src_pair
+        && ( (it->name_src_pair[0] == name_src1 && it->name_src_pair[1] == name_src2)
+         ||  (it->name_src_pair[0] == name_src2 && it->name_src_pair[1] == name_src1) )) {
+            it->cr_dif_travel_time = cr_dif;
+        }
+    }
+}
+
+
+inline bool get_if_any_src_pair(std::vector<DataInfo>& v){
+    // return the first element in the vector with is_rec_pair = true
+    for (auto it = v.begin(); it != v.end(); it++){
+        if (it->is_src_pair)
+            return true;
+    }
+
+    // or maybe this will be enough
+    //return v.size() > 1;
+
+    // return false if no rec pair is found
+    return false;
+}
+
+
+#endif
--- a/include/inv_grid.h
+++ b/include/inv_grid.h
@@ -0,0 +1,75 @@
+#ifndef INV_GRID_H
+#define INV_GRID_H
+
+#include <algorithm>
+
+#include "config.h"
+#include "input_params.h"
+
+
+// Base class for 1D inversion grid ()
+class InvGrid1dBase {
+public:
+    InvGrid1dBase(){};
+    virtual ~InvGrid1dBase(){
+        delete[] arr;
+    };
+
+    // copy constructor
+    InvGrid1dBase(const InvGrid1dBase& other){
+        n = other.n;
+        dinv = other.dinv;
+        dinv_l = other.dinv_l;
+        arr = new CUSTOMREAL[n];
+        std::copy(other.arr, other.arr+n, arr);
+    }
+
+    // assignment operator
+    InvGrid1dBase& operator=(const InvGrid1dBase& other){
+        if (this != &other){
+            n = other.n;
+            dinv = other.dinv;
+            dinv_l = other.dinv_l;
+            arr = new CUSTOMREAL[n];
+            std::copy(other.arr, other.arr+n, arr);
+        }
+        return *this;
+    }
+
+    CUSTOMREAL* arr = nullptr; // 1d or 2d array storing the grid coordinates
+    int n; // number of grid points
+    CUSTOMREAL dinv; // grid spacing
+    CUSTOMREAL dinv_l; // amount of shift for each inversion grid
+};
+
+// Derived class for 1D inversion grid
+class InvGrid1d : public InvGrid1dBase {
+public:
+    InvGrid1d(InputParams&, const int, const CUSTOMREAL*); // for r grid
+    InvGrid1d(InputParams&, const int, const CUSTOMREAL*, const CUSTOMREAL*, const int, const CUSTOMREAL*); // function overload for t and p grids
+    ~InvGrid1d() override {};
+};
+
+
+// Base class for 3D inversion grid
+class InvGrid {
+public:
+    InvGrid(InputParams&);
+    ~InvGrid();
+
+    void write_inversion_grid_to_file();
+
+    InvGrid1dBase r;
+    InvGrid1dBase t;
+    InvGrid1dBase p;
+    InvGrid1dBase r_ani;
+    InvGrid1dBase t_ani;
+    InvGrid1dBase p_ani;
+
+private:
+    void get_inv_grid_params(InputParams&);
+};
+
+
+
+#endif // INV_GRID_H
--- a/include/io.fwd.h
+++ b/include/io.fwd.h
@@ -0,0 +1,2 @@
+#pragma once
+class IO_utils;
--- a/include/io.h
+++ b/include/io.h
@@ -0,0 +1,284 @@
+#ifndef IO_H
+#define IO_H
+
+#include <fstream>
+#include <vector>
+#if USE_HDF5
+    #include "hdf5.h"
+#endif
+#include "tinyxml2.h"
+
+// to handle circular dependency
+#pragma once
+#include "grid.fwd.h"
+//#include "source.fwd.h"
+#include "io.fwd.h"
+
+#include "utils.h"
+#include "mpi_funcs.h"
+#include "grid.h"
+#include "config.h"
+#include "input_params.h"
+#include "eikonal_solver_2d.h"
+
+
+class IO_utils {
+
+public:
+    IO_utils(InputParams&);
+    ~IO_utils();
+
+    // initialize data output file
+    void init_data_output_file();
+    // finalize data/grid outpiut file (xdmf)
+    void finalize_data_output_file();
+    // change the output file name and xdmf objects
+    void change_xdmf_obj(int);
+    // output for updating xdmf file
+    void update_xdmf_file();
+
+    // change group name for source
+    void reset_source_info(const int&, const std::string&);
+    // change group name for model
+    void change_group_name_for_model();
+    // prepare grid for each inversion iteration
+    void prepare_grid_inv_xdmf(int);
+
+    // set id_sim
+    void set_id_src(const int& id_src_){id_sim_src = id_src_;};
+    // set source name
+    void set_name_src(const std::string& name_src_){name_sim_src = name_src_;};
+
+
+    //
+    // write functions
+    //
+
+    // xdmf write out for grid data
+    void write_xdmf_file_grid();
+
+    // write grid data to output file
+    void write_grid(Grid&);
+    // general function for write out data in hdf5
+    void write_data_h5(Grid&, std::string&, std::string&, CUSTOMREAL*, int, bool, bool for_tmp_db=false);
+    // general function for write out data in hdf5 with merging subdomains
+    void write_data_merged_h5(Grid&, std::string&, std::string&, std::string&, CUSTOMREAL*, bool, bool no_group=true);
+    // general function for write out data in ascii
+    void write_data_ascii(Grid&, std::string&, CUSTOMREAL*);
+    // general function for write out data in ascii with merging subdomains
+    void write_data_merged_ascii(Grid&, std::string&);
+    // write true solution
+    void write_true_solution(Grid&);
+    // write velocity model
+    void write_vel(Grid&, int);
+    // write T0v
+    void write_T0v(Grid&, int);
+    // write u
+    void write_u(Grid&);
+    // write tau
+    void write_tau(Grid&, int);
+    // write temporal tau fields
+    //void write_tmp_tau(Grid&, int);
+    // write result timetable T
+    void write_T(Grid&, int);
+    // write temporal timetable T in a separated file
+    void write_T_tmp(Grid&);
+    // write residual (resudual = true_solution - result)
+    void write_residual(Grid&);
+    // write adjoint field
+    void write_adjoint_field(Grid&, int);
+    // write fun_loc
+    void write_fun(Grid&, int);
+    // write xi
+    void write_xi(Grid&, int);
+    // write eta
+    void write_eta(Grid&, int);
+    // write a
+    void write_a(Grid&, int);
+    // write b
+    void write_b(Grid&, int);
+    // write c
+    void write_c(Grid&, int);
+    // write f
+    void write_f(Grid&, int);
+    // Ks
+    void write_Ks(Grid&, int);
+    // Kxi
+    void write_Kxi(Grid&, int);
+    // Keta
+    void write_Keta(Grid&, int);
+    // Ks_density
+    void write_Ks_density(Grid&, int);
+    // Kxi_density
+    void write_Kxi_density(Grid&, int);
+    // Keta_density
+    void write_Keta_density(Grid&, int);
+    // Ks_over_Kden
+    void write_Ks_over_Kden(Grid&, int);
+    // Kxi_over_Kden
+    void write_Kxi_over_Kden(Grid&, int);
+    // Keta_over_Kden
+    void write_Keta_over_Kden(Grid&, int);
+    // Ks_update
+    void write_Ks_update(Grid&, int);
+    // Kxi_update
+    void write_Kxi_update(Grid&, int);
+    // Keta_update
+    void write_Keta_update(Grid&, int);
+    // Ks_density_update
+    void write_Ks_density_update(Grid&, int);
+    // Kxi_density_update
+    void write_Kxi_density_update(Grid&, int);
+    // Keta_density_update
+    void write_Keta_density_update(Grid&, int);
+    // Ks_descent_dir_loc
+    void write_Ks_descent_dir(Grid&, int);
+    // Kxi_descent_dir_loc
+    void write_Kxi_descent_dir(Grid&, int);
+    // Keta_descent_dir_loc
+    void write_Keta_descent_dir(Grid&, int);
+
+    // write all concerning parameters
+    std::vector<CUSTOMREAL> get_grid_data(CUSTOMREAL * data);
+    // void write_concerning_parameters(Grid&, int, InputParams&);
+
+    // 2d traveltime field for teleseismic source
+    void write_2d_travel_time_field(CUSTOMREAL*, CUSTOMREAL*, CUSTOMREAL*, int, int, CUSTOMREAL);
+    void h5_create_and_write_dataset_2d(std::string&, int, int*, int, CUSTOMREAL*);
+    void read_2d_travel_time_field(std::string&, CUSTOMREAL*, int, int);
+
+    // 2d field for 1dinv
+    void write_1dinv_field(CUSTOMREAL*, CUSTOMREAL*, CUSTOMREAL*, int, int, std::string&);
+
+    // merged model
+    void write_T_merged(Grid&, InputParams&, int);
+    void write_merged_model(Grid&, InputParams&, std::string);
+    bool node_of_this_subdomain(int*, const int&, const int&, const int&);
+
+    //
+    // read functions
+    //
+
+    // read model data
+    void read_model(std::string&, const char*, CUSTOMREAL*, int, int, int);
+    // read Travel time from file for earthquake relocation and common receiver double-difference
+    void read_T(Grid&);
+    // read Travel time from a temporal file for earthquake relocation and common receiver double-difference
+    void read_T_tmp(Grid&);
+
+    void read_data_ascii(Grid&, std::string&);
+
+    //
+    // delete functions
+    //
+    // Any deletion of data should be done on a created hdf5. Because the design of hdf5 is that
+    // it is not possible to delete data from an existing hdf5 file. The only way to do so is to
+    // create a new hdf5 file and copy the necessary data from the old one to the new one.
+    // Therefore, the deletion of data is not implemented in this class.
+    // Reference : https://stackoverflow.com/questions/1124994/removing-data-from-a-hdf5-file
+
+private:
+    // member variables
+    std::string h5_output_grid_fname = "out_data_grid.h5"; // output file name
+    std::string h5_output_fname      = "out_data.h5";      // output file name
+    std::string h5_output_fname_tmp  = "out_data_tmp.h5";  // output file name
+    std::string xdmf_output_fname    = "out_data.xmf";     // output file name
+    std::string h5_group_name_grid   = "Mesh";               // group name for grid data
+    std::string h5_group_name_event  = "Event";              // group name for event data
+    std::string h5_group_name_data   = "Data";               // group name for event data
+
+    std::string h5_dset_name_node_coords_x = "node_coords_x"; // dataset name for node coordinates
+    std::string h5_dset_name_node_coords_y = "node_coords_y"; // dataset name for node coordinates
+    std::string h5_dset_name_node_coords_z = "node_coords_z"; // dataset name for node coordinates
+    std::string h5_dset_name_node_coords_p = "node_coords_p"; // dataset name for node coordinates
+    std::string h5_dset_name_node_coords_t = "node_coords_t"; // dataset name for node coordinates
+    std::string h5_dset_name_node_coords_r = "node_coords_r"; // dataset name for node coordinates
+    std::string h5_dset_name_elem_conn     = "elem_conn";     // dataset name for element connectivity
+    std::string h5_dset_name_procid        = "procid";        // dataset name for my_procs
+
+    std::string h5_file_and_group  = h5_output_grid_fname + ":/" + h5_group_name_grid;
+    std::string h5_whole_name_conn = h5_file_and_group + "/" + h5_dset_name_elem_conn;
+    std::string h5_whole_name_x    = h5_file_and_group + "/" + h5_dset_name_node_coords_x;
+    std::string h5_whole_name_y    = h5_file_and_group + "/" + h5_dset_name_node_coords_y;
+    std::string h5_whole_name_z    = h5_file_and_group + "/" + h5_dset_name_node_coords_z;
+    std::string h5_whole_name_proc = h5_file_and_group + "/" + h5_dset_name_procid;
+
+    // index of current simulation used for naming output files and datasets
+    int id_sim_src;
+    // name of current simulation used for naming output files and datasets
+    std::string name_sim_src;
+
+    int nnodes_glob = 0;
+    int nelms_glob  = 0;
+
+    int array_rank_default = 2; // default array rank
+
+    int custom_real_flag; //  custom real type
+
+    // custom data type for real
+    std::string type_name = "Float";
+    std::string type_size = "4";
+
+    // helper for xdmf/xml file io
+    tinyxml2::XMLDocument* doc;
+    tinyxml2::XMLNode* xdmf, *domain, *grid;
+    tinyxml2::XMLNode* inversions, *inv_grid;
+    void init_xdmf_file();
+    void write_xdmf_file();
+    void finalize_xdmf_file();
+    void insert_data_xdmf(std::string&, std::string&, std::string&);
+
+#ifdef USE_HDF5
+    // h5 variables
+    hid_t  file_id;
+    hid_t  group_id;
+    hid_t  space_id;
+    hid_t  dset_id;
+    hid_t  plist_id, plist_id_dset;
+    hid_t  file_dspace_id, mem_dspace_id;
+    herr_t status;
+    hid_t  plist_id_2d;
+    hid_t  file_id_2d;
+
+    // h5 low level utilities
+    void h5_create_file_by_group_main(std::string&);
+    void h5_open_file_by_group_main(std::string&);
+    void h5_open_file_collective(std::string&);
+    void h5_open_file_collective_input(std::string&);
+    void h5_close_file_by_group_main();
+    void h5_close_file_collective();
+
+    void h5_create_group_by_group_main(std::string& );
+    void h5_open_group_by_group_main(std::string& );
+    void h5_open_group_collective(std::string& );
+    void h5_close_group_by_group_main();
+    void h5_close_group_collective();
+
+    void h5_create_dataset(std::string&, int, int*, int, bool no_group = false);
+    void h5_open_dataset(std::string&);
+    void h5_open_dataset_no_group(std::string&);
+    void h5_close_dataset();
+
+    template <typename T>
+    void h5_write_array(std::string&, int, int*, T*, int offset_one, int offset_two=0, int offset_three=0, bool no_group=false);
+
+    template <typename T>
+    void h5_read_array(std::string&, int, int*, T*, int*, bool);
+
+    template <typename T>
+    void h5_read_array_simple(std::string&,  T*);
+
+    void read_data_h5(Grid&, CUSTOMREAL*, std::string, std::string);
+#endif // HDF_HDF5
+
+    // utilities
+    std::string create_fname_ascii(std::string&);
+    std::string create_fname_ascii_model(std::string&);
+
+    // flag for data type
+    const bool model_data = true;
+    const bool src_data = false;
+
+};
+
+#endif // IO_H
--- a/include/iterator.h
+++ b/include/iterator.h
@@ -0,0 +1,161 @@
+#ifndef ITERATOR_H
+#define ITERATOR_H
+
+#include <algorithm>
+#include <initializer_list>
+#include <vector>
+#include <math.h>
+#include "grid.h"
+#include "input_params.h"
+#include "utils.h"
+#include "source.h"
+#include "io.h"
+#include "timer.h"
+#include "eikonal_solver_2d.h"
+
+#ifdef USE_CUDA
+#include "grid_wrapper.cuh"
+#include "iterator_wrapper.cuh"
+#endif
+
+//#ifdef USE_SIMD
+#include "simd_conf.h"
+//#endif
+
+class Iterator {
+public:
+    Iterator(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+    virtual ~Iterator();
+    // regional source
+    void run_iteration_forward(InputParams&, Grid&, IO_utils&, bool&); // run forward iteration till convergence
+    void run_iteration_adjoint(InputParams&, Grid&, IO_utils&, int);        // run adjoint iteration till convergence
+
+    void initialize_arrays(InputParams&, IO_utils&, Grid&, Source&, const std::string&); // initialize factors etc.
+
+protected:
+    void assign_processes_for_levels(Grid&, InputParams&); // assign intra-node processes for each sweeping level
+    void set_sweep_direction(int);                         // set sweep direction
+    // regional source
+    virtual void do_sweep(int, Grid&, InputParams&){};                // do sweeping with ordinal method
+    void calculate_stencil_1st_order(Grid&, int&, int&, int&);        // calculate stencil for 1st order
+    void calculate_stencil_3rd_order(Grid&, int&, int&, int&);        // calculate stencil for 3rd order
+    void calculate_stencil_1st_order_upwind(Grid&, int&, int&, int&); // calculate stencil for 1st order in upwind form
+    void calculate_boundary_nodes(Grid&);                             // calculate boundary values
+    // teleseismic source
+    void calculate_stencil_1st_order_tele(Grid&, int&, int&, int&);        // calculate stencil for 1st order
+    void calculate_stencil_3rd_order_tele(Grid&, int&, int&, int&);        // calculate stencil for 3rd order
+    void calculate_stencil_1st_order_upwind_tele(Grid&, int&, int&, int&); // calculate stencil for 1st order in upwind form
+    void calculate_boundary_nodes_tele(Grid&, int&, int&, int&);           // calculate boundary values for teleseismic source
+
+    void calculate_boundary_nodes_adj(Grid&, int&, int&, int&);       // calculate boundary values for adjoint source (all zeros)
+
+    // Hamiltonian calculation
+    inline CUSTOMREAL calc_LF_Hamiltonian(Grid&, CUSTOMREAL& ,CUSTOMREAL& , \
+                                                 CUSTOMREAL& ,CUSTOMREAL& , \
+                                                 CUSTOMREAL& ,CUSTOMREAL&, \
+                                                 int&, int&, int& );
+    inline CUSTOMREAL calc_LF_Hamiltonian_tele(Grid&, CUSTOMREAL& ,CUSTOMREAL& , \
+                                                 CUSTOMREAL& ,CUSTOMREAL& , \
+                                                 CUSTOMREAL& ,CUSTOMREAL&, \
+                                                 int&, int&, int& );
+
+    // methods for adjoint field calculation
+    void init_delta_and_Tadj(Grid&, InputParams&);                     // initialize delta and Tadj
+    void init_delta_and_Tadj_density(Grid&, InputParams&);             // initialize delta and Tadj_density
+    void fix_boundary_Tadj(Grid&);                                     // fix boundary values for Tadj
+    virtual void do_sweep_adj(int, Grid&, InputParams&){};             // do sweeping with ordinal method for adjoint field
+    void calculate_stencil_adj(Grid&, int&, int&, int&);               // calculate stencil for 1st order for adjoint field
+
+    // grid point information
+    int* _nr, *_nt, *_np;           // number of grid points on the direction r, theta, phi
+    CUSTOMREAL* _dr, *_dt, *_dp;    // grid spacing on the direction r, theta, phi
+    int nr, nt, np;                 // number of grid points on the direction r, theta, phi
+    CUSTOMREAL dr, dt, dp;          // grid spacing on the direction r, theta, phi
+    int st_level;                   // start level for sweeping
+    int ed_level;                   // end level for sweeping
+    std::vector< std::vector<int> > ijk_for_this_subproc; // ijk=I2V(i,j,k) for this process (level, ijk)
+    int max_n_nodes_plane;                                // maximum number of nodes on a plane
+
+
+#if defined USE_SIMD || defined USE_CUDA
+    // stencil dumps
+    // first orders
+    CUSTOMREAL *dump_c__;// center of C
+    // all grid data expect tau pre-load strategy (iswap, ilevel, inodes)
+#if USE_AVX512 || USE_AVX || USE_NEON || defined USE_CUDA
+    std::vector<std::vector<int*>> vv_i__j__k__, vv_ip1j__k__, vv_im1j__k__, vv_i__jp1k__, vv_i__jm1k__, vv_i__j__kp1, vv_i__j__km1;
+    std::vector<std::vector<int*>>               vv_ip2j__k__, vv_im2j__k__, vv_i__jp2k__, vv_i__jm2k__, vv_i__j__kp2, vv_i__j__km2;
+#elif USE_ARM_SVE
+    std::vector<std::vector<uint64_t*>> vv_i__j__k__, vv_ip1j__k__, vv_im1j__k__, vv_i__jp1k__, vv_i__jm1k__, vv_i__j__kp1, vv_i__j__km1;
+    std::vector<std::vector<uint64_t*>>               vv_ip2j__k__, vv_im2j__k__, vv_i__jp2k__, vv_i__jm2k__, vv_i__j__kp2, vv_i__j__km2;
+#endif
+    std::vector<std::vector<CUSTOMREAL*>> vv_iip, vv_jjt, vv_kkr;
+
+    std::vector<std::vector<CUSTOMREAL*>> vv_fac_a, vv_fac_b, vv_fac_c, vv_fac_f, vv_T0v, vv_T0r, vv_T0t, vv_T0p, vv_fun;
+    std::vector<std::vector<CUSTOMREAL*>>vv_change;
+    std::vector<std::vector<bool*>>vv_change_bl;
+
+    template <typename T>
+    void preload_indices(std::vector<std::vector<T*>> &vi, std::vector<std::vector<T*>> &, std::vector<std::vector<T*>> &, int, int, int);
+    template <typename T>
+    void preload_indices_1d(std::vector<std::vector<T*>> &, int, int, int);
+    template <typename T>
+    std::vector<std::vector<CUSTOMREAL*>> preload_array(T* a);
+    std::vector<std::vector<bool*>> preload_array_bl(bool* a);
+    template <typename T>
+    void free_preloaded_array(std::vector<std::vector<T*>> &vvv){
+        for (int iswap = 0; iswap < 8; iswap++){
+            for (auto& vv : vvv.at(iswap)) free(vv);
+        }
+    }
+    // flag for deallocation
+    bool simd_allocated     = false;
+    bool simd_allocated_3rd = false;
+
+#endif // USE_SIMD || USE_CUDA
+
+#ifdef USE_CUDA
+    Grid_on_device *gpu_grid;
+#endif
+
+
+    const int nswp = 8;          // number of sweeping directions
+    int r_dirc, t_dirc, p_dirc;  // sweeping directions
+    CUSTOMREAL sigr, sigt, sigp; //
+    CUSTOMREAL coe;
+    CUSTOMREAL wp1, pp1, wp2, pp2;
+    CUSTOMREAL wt1, pt1, wt2, pt2;
+    CUSTOMREAL wr1, pr1, wr2, pr2;
+    CUSTOMREAL Htau, tpT;
+
+    CUSTOMREAL ap1, bp1, ap2, bp2, ap, bp;
+    CUSTOMREAL at1, bt1, at2, bt2, at, bt;
+    CUSTOMREAL ar1, br1, ar2, br2, ar, br;
+
+    CUSTOMREAL bc_f2, eqn_a, eqn_b, eqn_c, eqn_Delta;
+    CUSTOMREAL tmp_tau, tmp_T;
+    CUSTOMREAL T_r, T_t, T_p, charact_r, charact_t, charact_p;
+    bool is_causality;
+    int count_cand;
+    std::vector<CUSTOMREAL> canditate = std::vector<CUSTOMREAL>(60);
+
+
+    // iteration control
+    int iter_count = 0;
+    CUSTOMREAL ini_diff_L1 = HUGE_VAL, ini_diff_Linf = HUGE_VAL;
+    CUSTOMREAL ini_err_L1  = HUGE_VAL, ini_err_Linf  = HUGE_VAL;
+    CUSTOMREAL cur_diff_L1 = HUGE_VAL, cur_diff_Linf = HUGE_VAL;
+    CUSTOMREAL cur_err_L1  = HUGE_VAL, cur_err_Linf  = HUGE_VAL;
+
+    // teleseismic flag
+    bool is_teleseismic = false;
+
+    // second run for hybrid order method
+    bool is_second_run = false;
+
+};
+
+// define derived classes for each iteration scheme
+
+
+#endif // ITERATOR_H
--- a/include/iterator_legacy.h
+++ b/include/iterator_legacy.h
@@ -0,0 +1,82 @@
+#ifndef ITERATOR_legacy_H
+#define ITERATOR_legacy_H
+
+#include "iterator.h"
+
+
+class Iterator_legacy : public Iterator {
+public:
+    Iterator_legacy(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+protected:
+    void do_sweep_adj(int, Grid&, InputParams&) override ; // do sweeping for adjoint routine
+    virtual void do_sweep(int, Grid&, InputParams&) {}; // do sweeping
+};
+
+
+class Iterator_legacy_tele : public Iterator {
+public:
+    Iterator_legacy_tele(InputParams& , Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+protected:
+    void do_sweep_adj(int, Grid&, InputParams&) override ; // do sweeping for adjoint routine
+    virtual void do_sweep(int, Grid&, InputParams&) {}; // do sweeping
+};
+
+
+class Iterator_legacy_1st_order : public Iterator_legacy {
+
+public:
+    Iterator_legacy_1st_order(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+
+};
+
+class Iterator_legacy_3rd_order : public Iterator_legacy {
+
+public:
+    Iterator_legacy_3rd_order(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+    //void run_iteration_forward(InputParams&, Grid&, IO_utils&, bool&);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+
+};
+
+class Iterator_legacy_1st_order_upwind : public Iterator_legacy {
+
+public:
+    Iterator_legacy_1st_order_upwind(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+};
+
+class Iterator_legacy_1st_order_tele : public Iterator_legacy_tele {
+
+public:
+    Iterator_legacy_1st_order_tele(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+    //void run_iteration_forward(InputParams&, Grid&, IO_utils&, bool&);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+
+};
+
+class Iterator_legacy_3rd_order_tele : public Iterator_legacy_tele {
+
+public:
+    Iterator_legacy_3rd_order_tele(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+    //void run_iteration_forward(InputParams&, Grid&, IO_utils&, bool&);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+
+};
+
+class Iterator_legacy_1st_order_upwind_tele : public Iterator_legacy_tele {
+
+public:
+    Iterator_legacy_1st_order_upwind_tele(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+    //void run_iteration_forward(InputParams&, Grid&, IO_utils&, bool&);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+
+};
+
+#endif // ITERATOR_LEGACY_H
--- a/include/iterator_level.h
+++ b/include/iterator_level.h
@@ -0,0 +1,65 @@
+#ifndef ITERATOR_LEVEL_H
+#define ITERATOR_LEVEL_H
+
+#include "iterator.h"
+
+
+class Iterator_level : public Iterator {
+public:
+    Iterator_level(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+protected:
+    void do_sweep_adj(int, Grid&, InputParams&) override ; // do sweeping for adjoint routine
+    virtual void do_sweep(int, Grid&, InputParams&) {}; // do sweeping
+};
+
+class Iterator_level_tele : public Iterator {
+public:
+    Iterator_level_tele(InputParams& , Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+protected:
+    void do_sweep_adj(int, Grid&, InputParams&) override ; // do sweeping for adjoint routine
+    virtual void do_sweep(int, Grid&, InputParams&) {}; // do sweeping
+};
+
+class Iterator_level_1st_order : public Iterator_level {
+public:
+    Iterator_level_1st_order(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+};
+
+class Iterator_level_3rd_order : public Iterator_level {
+public:
+    Iterator_level_3rd_order(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+};
+
+class Iterator_level_1st_order_upwind : public Iterator_level {
+public:
+    Iterator_level_1st_order_upwind(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+};
+
+class Iterator_level_1st_order_tele : public Iterator_level_tele {
+public:
+    Iterator_level_1st_order_tele(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+};
+
+class Iterator_level_3rd_order_tele : public Iterator_level_tele {
+public:
+    Iterator_level_3rd_order_tele(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+};
+
+class Iterator_level_1st_order_upwind_tele : public Iterator_level_tele {
+public:
+    Iterator_level_1st_order_upwind_tele(InputParams&, Grid&, Source&, IO_utils&, const std::string&, bool, bool, bool);
+private:
+    void do_sweep(int, Grid&, InputParams&) override ; // do sweeping
+};
+
+#endif // ITERATOR_LEVEL_H
--- a/include/iterator_selector.h
+++ b/include/iterator_selector.h
@@ -0,0 +1,98 @@
+#ifndef ITERATOR_SELECTOR_H
+#define ITERATOR_SELECTOR_H
+
+#include <memory>
+#include "iterator.h"
+#include "iterator_legacy.h"
+#include "iterator_level.h"
+
+
+void select_iterator(InputParams& IP, Grid& grid, Source& src, IO_utils& io, const std::string& src_name, \
+                     bool first_init, bool is_teleseismic, std::unique_ptr<Iterator>& It, bool is_second_run) {
+    // initialize iterator object
+
+    if (!is_teleseismic){
+        if (IP.get_sweep_type() == SWEEP_TYPE_LEGACY) {
+            if (IP.get_stencil_order() == 1){
+                if (IP.get_stencil_type() == UPWIND){
+                    It = std::make_unique<Iterator_legacy_1st_order_upwind>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                } else {
+                    It = std::make_unique<Iterator_legacy_1st_order>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                }
+            } else if (IP.get_stencil_order() == 3){
+                It = std::make_unique<Iterator_legacy_3rd_order>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                if (IP.get_stencil_type() == UPWIND){
+                    std::cout << "WARNING: Upwind Stencil type not supported, using 3rd order non upwind scheme (LF)" << std::endl;
+                }
+            } else{
+                std::cout << "ERROR: Stencil order not supported" << std::endl;
+                exit(1);
+            }
+        } else if (IP.get_sweep_type() == SWEEP_TYPE_LEVEL){
+            if (IP.get_stencil_order() == 1){
+                if (IP.get_stencil_type() == UPWIND){
+                    // std::cout << "WARNING: Upwind Stencil type not supported, using non upwind scheme (LF)" << std::endl;
+                    It = std::make_unique<Iterator_level_1st_order_upwind>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                } else {
+                    It = std::make_unique<Iterator_level_1st_order>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                }
+            } else if (IP.get_stencil_order() == 3){
+                It = std::make_unique<Iterator_level_3rd_order>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                if (IP.get_stencil_type() == UPWIND){
+                    std::cout << "WARNING: Upwind Stencil type not supported, using 3rd order non upwind scheme (LF)" << std::endl;
+                }
+            } else{
+                std::cout << "ERROR: Stencil order not supported" << std::endl;
+                exit(1);
+            }
+        } else {
+            std::cout << "ERROR: Sweep type not supported" << std::endl;
+            exit(1);
+        }
+    } else { // teleseismic event
+        if (IP.get_sweep_type() == SWEEP_TYPE_LEGACY) {
+            if (IP.get_stencil_order() == 1){
+                if (IP.get_stencil_type() == UPWIND){
+                    It = std::make_unique<Iterator_legacy_1st_order_upwind_tele>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                } else {
+                    It = std::make_unique<Iterator_legacy_1st_order_tele>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                }
+            } else if (IP.get_stencil_order() == 3){
+                It = std::make_unique<Iterator_legacy_3rd_order_tele>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                if (IP.get_stencil_type() == UPWIND){
+                    std::cout << "WARNING: Upwind Stencil type not supported, using non upwind scheme (LF)" << std::endl;
+                }
+            } else{
+                std::cout << "ERROR: Stencil order not supported" << std::endl;
+                exit(1);
+            }
+        } else if (IP.get_sweep_type() == SWEEP_TYPE_LEVEL){
+            if (IP.get_stencil_order() == 1){
+                if (IP.get_stencil_type() == UPWIND){
+                    It = std::make_unique<Iterator_level_1st_order_upwind_tele>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                } else {
+                    It = std::make_unique<Iterator_level_1st_order_tele>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                    std::cout << "WARNING: SWEEP_TYPE_LEVEL for non-unwind solver does not work. Index wrong currently" << std::endl;
+                    exit(1);
+                }
+            } else if (IP.get_stencil_order() == 3){
+                It = std::make_unique<Iterator_level_3rd_order_tele>(IP, grid, src, io, src_name, first_init, is_teleseismic, is_second_run);
+                std::cout << "WARNING: SWEEP_TYPE_LEVEL for non-unwind solver does not work. Index wrong currently" << std::endl;
+                exit(1);
+                if (IP.get_stencil_type() == UPWIND){
+                    std::cout << "WARNING: Upwind Stencil type not supported, using non upwind scheme (LF)" << std::endl;
+                }
+            } else{
+                std::cout << "ERROR: Stencil order not supported" << std::endl;
+                exit(1);
+            }
+        } else {
+            std::cout << "ERROR: Sweep type not supported" << std::endl;
+            exit(1);
+        }
+    }
+
+}
+
+#endif
+
--- a/include/kernel.h
+++ b/include/kernel.h
@@ -0,0 +1,226 @@
+#ifndef KERNEL_H
+#define KERNEL_H
+
+#include "config.h"
+#include "grid.h"
+#include "input_params.h"
+
+void calculate_sensitivity_kernel(Grid& grid, InputParams& IP, const std::string& name_sim_src){
+    // calculate sensitivity kernel
+
+    // kernel calculation will be done only by the subdom_main
+    if (subdom_main) {
+        // get the necessary parameters
+        int np             = loc_I;
+        int nt             = loc_J;
+        int nr             = loc_K;
+        CUSTOMREAL dr      = grid.dr;
+        CUSTOMREAL dt      = grid.dt;
+        CUSTOMREAL dp      = grid.dp;
+        CUSTOMREAL src_lon = IP.get_src_lon(   name_sim_src);
+        CUSTOMREAL src_lat = IP.get_src_lat(   name_sim_src);
+        CUSTOMREAL src_r   = IP.get_src_radius(name_sim_src);
+
+        CUSTOMREAL weight   = _1_CR;
+
+        // inner points
+        for (int kkr = 1; kkr < nr-1; kkr++) {
+            for (int jjt = 1; jjt < nt-1; jjt++) {
+                for (int iip = 1; iip < np-1; iip++) {
+
+                    // calculate the kernel
+                    CUSTOMREAL Tr_km     = (grid.T_loc[I2V(iip,jjt,kkr+1)] - grid.T_loc[I2V(iip,jjt,kkr-1)]) / (_2_CR * dr);
+                    CUSTOMREAL Ttheta_km = (grid.T_loc[I2V(iip,jjt+1,kkr)] - grid.T_loc[I2V(iip,jjt-1,kkr)]) / (_2_CR * dt) / grid.r_loc_1d[kkr];
+                    CUSTOMREAL Tphi_km   = (grid.T_loc[I2V(iip+1,jjt,kkr)] - grid.T_loc[I2V(iip-1,jjt,kkr)]) / (_2_CR * dp) / (grid.r_loc_1d[kkr]*std::cos(grid.t_loc_1d[jjt]));
+
+                    CUSTOMREAL azi_ratio = std::sqrt((my_square(Ttheta_km) + my_square(Tphi_km))/(my_square(Tr_km) + my_square(Ttheta_km) + my_square(Tphi_km)));
+                    
+                    // mask within one grid around the source
+                    if ((std::abs(grid.r_loc_1d[kkr] - src_r) >= dr) ||
+                        (std::abs(grid.t_loc_1d[jjt] - src_lat) >= dt) ||
+                        (std::abs(grid.p_loc_1d[iip] - src_lon) >= dp)) {
+                        // density of ks
+                        grid.Ks_density_loc[I2V(iip,jjt,kkr)]   += weight * grid.Tadj_density_loc[I2V(iip,jjt,kkr)];  
+                        
+                        // density of kxi
+                        // grid.Kxi_density_loc[I2V(iip,jjt,kkr)] += weight * grid.Tadj_density_loc[I2V(iip,jjt,kkr)]; 
+                        grid.Kxi_density_loc[I2V(iip,jjt,kkr)]  += weight * grid.Tadj_density_loc[I2V(iip,jjt,kkr)] * azi_ratio;
+                                                                
+                        // density of keta
+                        // grid.Keta_density_loc[I2V(iip,jjt,kkr)] += weight * grid.Tadj_density_loc[I2V(iip,jjt,kkr)]; 
+                        grid.Keta_density_loc[I2V(iip,jjt,kkr)] += weight * grid.Tadj_density_loc[I2V(iip,jjt,kkr)] * azi_ratio;
+        
+
+                        if (IP.get_update_slowness()==1){      // we need to update slowness
+                            // Kernel w r t slowness s
+                            grid.Ks_loc[I2V(iip,jjt,kkr)] += weight * grid.Tadj_loc[I2V(iip,jjt,kkr)] * my_square(grid.fun_loc[I2V(iip,jjt,kkr)]);                                                  
+                        } else {
+                            grid.Ks_loc[I2V(iip,jjt,kkr)] = _0_CR;
+                        }
+
+
+                        if (IP.get_update_azi_ani()){      // we need to update azimuthal anisotropy
+                            // Kernel w r t anisotrophy xi
+                            if (isZero(std::sqrt(my_square(grid.xi_loc[I2V(iip,jjt,kkr)])+my_square(grid.eta_loc[I2V(iip,jjt,kkr)])))) {
+                                grid.Kxi_loc[I2V(iip,jjt,kkr)]  += weight * grid.Tadj_loc[I2V(iip,jjt,kkr)] \
+                                                                * (my_square(Ttheta_km) - my_square(Tphi_km));
+
+                                grid.Keta_loc[I2V(iip,jjt,kkr)] += weight * grid.Tadj_loc[I2V(iip,jjt,kkr)] \
+                                                                * ( -_2_CR * Ttheta_km * Tphi_km );
+
+                            } else {
+                                grid.Kxi_loc[I2V(iip,jjt,kkr)]  += weight * grid.Tadj_loc[I2V(iip,jjt,kkr)] \
+                                                            * ((- GAMMA * grid.xi_loc[I2V(iip,jjt,kkr)] / \
+                                                                        std::sqrt(my_square(grid.xi_loc[ I2V(iip,jjt,kkr)]) + my_square(grid.eta_loc[I2V(iip,jjt,kkr)]))) * my_square(Tr_km) \
+                                                                + my_square(Ttheta_km) 
+                                                                - my_square(Tphi_km));
+
+                                grid.Keta_loc[I2V(iip,jjt,kkr)] += weight * grid.Tadj_loc[I2V(iip,jjt,kkr)] \
+                                                            * (( - GAMMA * grid.eta_loc[I2V(iip,jjt,kkr)]/ \
+                                                                            std::sqrt(my_square(grid.xi_loc[I2V(iip,jjt,kkr)]) + my_square(grid.eta_loc[I2V(iip,jjt,kkr)]))) * my_square(Tr_km) \
+                                                                    - _2_CR * Ttheta_km * Tphi_km );
+                            }
+                        } else {
+                            grid.Kxi_loc[I2V(iip,jjt,kkr)]  = _0_CR;
+                            grid.Keta_loc[I2V(iip,jjt,kkr)] = _0_CR;
+                        }
+
+                    } else{
+                        grid.Ks_loc[I2V(iip,jjt,kkr)]   += _0_CR;
+                        grid.Kxi_loc[I2V(iip,jjt,kkr)]  += _0_CR;
+                        grid.Keta_loc[I2V(iip,jjt,kkr)] += _0_CR;
+
+                        grid.Ks_density_loc[I2V(iip,jjt,kkr)]   += _0_CR;
+                        grid.Kxi_density_loc[I2V(iip,jjt,kkr)]  += _0_CR;
+                        grid.Keta_density_loc[I2V(iip,jjt,kkr)] += _0_CR;
+        
+                    }
+                }
+            }
+        }
+
+        // boundary
+        for (int kkr = 0; kkr < nr; kkr++) {
+            for (int jjt = 0; jjt < nt; jjt++) {
+                // set Ks Kxi Keta to zero
+                if (grid.i_first()){
+                    grid.Ks_loc[I2V(0,jjt,kkr)]             = _0_CR;
+                    grid.Kxi_loc[I2V(0,jjt,kkr)]            = _0_CR;
+                    grid.Keta_loc[I2V(0,jjt,kkr)]           = _0_CR;
+                    grid.Ks_density_loc[I2V(0,jjt,kkr)]     = _0_CR;
+                    grid.Kxi_density_loc[I2V(0,jjt,kkr)]    = _0_CR;
+                    grid.Keta_density_loc[I2V(0,jjt,kkr)]   = _0_CR;
+                }
+                if (grid.i_last()){
+                    grid.Ks_loc[I2V(np-1,jjt,kkr)]          = _0_CR;
+                    grid.Kxi_loc[I2V(np-1,jjt,kkr)]         = _0_CR;
+                    grid.Keta_loc[I2V(np-1,jjt,kkr)]        = _0_CR;
+                    grid.Ks_density_loc[I2V(np-1,jjt,kkr)]  = _0_CR;
+                    grid.Kxi_density_loc[I2V(np-1,jjt,kkr)] = _0_CR;
+                    grid.Keta_density_loc[I2V(np-1,jjt,kkr)]= _0_CR;
+                }
+           }
+        }
+        for (int kkr = 0; kkr < nr; kkr++) {
+            for (int iip = 0; iip < np; iip++) {
+                // set Ks Kxi Keta to zero
+                if (grid.j_first()){
+                    grid.Ks_loc[I2V(iip,0,kkr)]             = _0_CR;
+                    grid.Kxi_loc[I2V(iip,0,kkr)]            = _0_CR;
+                    grid.Keta_loc[I2V(iip,0,kkr)]           = _0_CR;
+                    grid.Ks_density_loc[I2V(iip,0,kkr)]     = _0_CR;
+                    grid.Kxi_density_loc[I2V(iip,0,kkr)]    = _0_CR;
+                    grid.Keta_density_loc[I2V(iip,0,kkr)]   = _0_CR;
+                }
+                if (grid.j_last()){
+                    grid.Ks_loc[I2V(iip,nt-1,kkr)]          = _0_CR;
+                    grid.Kxi_loc[I2V(iip,nt-1,kkr)]         = _0_CR;
+                    grid.Keta_loc[I2V(iip,nt-1,kkr)]        = _0_CR;
+                    grid.Ks_density_loc[I2V(iip,nt-1,kkr)]  = _0_CR;
+                    grid.Kxi_density_loc[I2V(iip,nt-1,kkr)] = _0_CR;
+                    grid.Keta_density_loc[I2V(iip,nt-1,kkr)]= _0_CR;
+                }
+            }
+        }
+        for (int jjt = 0; jjt < nt; jjt++) {
+            for (int iip = 0; iip < np; iip++) {
+                // set Ks Kxi Keta to zero
+                if (grid.k_first()){
+                    grid.Ks_loc[I2V(iip,jjt,0)]             = _0_CR;
+                    grid.Kxi_loc[I2V(iip,jjt,0)]            = _0_CR;
+                    grid.Keta_loc[I2V(iip,jjt,0)]           = _0_CR;
+                    grid.Ks_density_loc[I2V(iip,jjt,0)]     = _0_CR;
+                    grid.Kxi_density_loc[I2V(iip,jjt,0)]    = _0_CR;
+                    grid.Keta_density_loc[I2V(iip,jjt,0)]   = _0_CR;
+                }
+                if (grid.k_last()){
+                    grid.Ks_loc[I2V(iip,jjt,nr-1)]          = _0_CR;
+                    grid.Kxi_loc[I2V(iip,jjt,nr-1)]         = _0_CR;
+                    grid.Keta_loc[I2V(iip,jjt,nr-1)]        = _0_CR;
+                    grid.Ks_density_loc[I2V(iip,jjt,nr-1)]  = _0_CR;
+                    grid.Kxi_density_loc[I2V(iip,jjt,nr-1)] = _0_CR;
+                    grid.Keta_density_loc[I2V(iip,jjt,nr-1)]= _0_CR;
+                }
+            }
+        }
+
+    } // end if subdom_main
+}
+
+
+void check_kernel_density(Grid& grid, InputParams& IP) {
+    if(subdom_main){
+        // check local kernel density positivity
+        for (int i_loc = 0; i_loc < loc_I; i_loc++) {
+            for (int j_loc = 0; j_loc < loc_J; j_loc++) {
+                for (int k_loc = 0; k_loc < loc_K; k_loc++) {
+                    if (isNegative(grid.Ks_density_loc[I2V(i_loc,j_loc,k_loc)])){
+                        std::cout   << "Warning, id_sim: " << id_sim << ", grid.Ks_density_loc[I2V(" << i_loc << "," << j_loc << "," << k_loc << ")] is less than 0, = " 
+                                    << grid.Ks_density_loc[I2V(i_loc,j_loc,k_loc)]   
+                                    << std::endl;
+                        // // print the source name
+                        // for (int i_src = 0; i_src < IP.n_src_this_sim_group; i_src++){
+                        //     // get source info
+                        //     const std::string name_sim_src   = IP.get_src_name(i_src);   // get_src_name must be run by all processes
+                        //     std::cout   << "id_sim: " << id_sim << ", "
+                        //                 << i_src + 1 << "/" << IP.n_src_this_sim_group << ", source name: " << name_sim_src << std::endl;
+                        // }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+void sumup_kernels(Grid& grid) {
+    if(subdom_main){
+        int n_grids = loc_I*loc_J*loc_K;
+
+        allreduce_cr_sim_inplace(grid.Ks_loc, n_grids);
+        allreduce_cr_sim_inplace(grid.Kxi_loc, n_grids);
+        allreduce_cr_sim_inplace(grid.Keta_loc, n_grids);
+        allreduce_cr_sim_inplace(grid.Ks_density_loc, n_grids);
+        allreduce_cr_sim_inplace(grid.Kxi_density_loc, n_grids);
+        allreduce_cr_sim_inplace(grid.Keta_density_loc, n_grids);
+
+        // share the values on boundary
+        grid.send_recev_boundary_data(grid.Ks_loc);
+        grid.send_recev_boundary_data(grid.Kxi_loc);
+        grid.send_recev_boundary_data(grid.Keta_loc);
+        grid.send_recev_boundary_data(grid.Ks_density_loc);
+        grid.send_recev_boundary_data(grid.Kxi_density_loc);
+        grid.send_recev_boundary_data(grid.Keta_density_loc);
+
+        grid.send_recev_boundary_data_kosumi(grid.Ks_loc);
+        grid.send_recev_boundary_data_kosumi(grid.Kxi_loc);
+        grid.send_recev_boundary_data_kosumi(grid.Keta_loc);
+        grid.send_recev_boundary_data_kosumi(grid.Ks_density_loc);
+        grid.send_recev_boundary_data_kosumi(grid.Kxi_density_loc);
+        grid.send_recev_boundary_data_kosumi(grid.Keta_density_loc);
+    }
+
+    synchronize_all_world();
+}
+
+#endif
--- a/include/lbfgs.h
+++ b/include/lbfgs.h
@@ -0,0 +1,577 @@
+#ifndef LBFGS_H
+#define LBFGS_H
+
+#include "config.h"
+#include "utils.h"
+#include "grid.h"
+#include "mpi_funcs.h"
+
+inline CUSTOMREAL volume_domain      = _0_CR;     // volume of the domain
+inline CUSTOMREAL weight_Tikonov     = _0_CR;     // weight of the regularization term
+inline CUSTOMREAL weight_Tikonov_ani = _0_CR;     // weight of the regularization term for anisotropic regularization
+inline int        N_params           = 3;         // number of parameters to invert
+inline CUSTOMREAL q_0                = _0_CR;     // initial cost function
+inline CUSTOMREAL qp_0               = _0_CR;     // store initial p_k * grad(f_k) (descent_direction * gradient)
+inline int        damp_weight_fun    = _1_CR;     // damp weights for fun
+inline int        damp_weight_eta    = _1_CR;     // damp weights for eta
+inline int        damp_weight_xi     = _1_CR;     // damp weights for xi
+
+
+void store_model_and_gradient(Grid& grid, int i_inv) {
+
+    if (subdom_main && id_sim==0) {
+
+        if (i_inv < Mbfgs){
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+                        grid.Ks_model_store_loc[  I2VLBFGS(i_inv,i,j,k)] = grid.fun_loc[I2V(i,j,k)];
+                        grid.Keta_model_store_loc[I2VLBFGS(i_inv,i,j,k)] = grid.eta_loc[I2V(i,j,k)];
+                        grid.Kxi_model_store_loc[ I2VLBFGS(i_inv,i,j,k)] = grid.xi_loc[ I2V(i,j,k)];
+                        grid.Ks_grad_store_loc[   I2VLBFGS(i_inv,i,j,k)] = grid.Ks_update_loc[  I2V(i,j,k)];
+                        grid.Keta_grad_store_loc[ I2VLBFGS(i_inv,i,j,k)] = grid.Keta_update_loc[I2V(i,j,k)];
+                        grid.Kxi_grad_store_loc[  I2VLBFGS(i_inv,i,j,k)] = grid.Kxi_update_loc[ I2V(i,j,k)];
+                    }
+                }
+            }
+
+        } else {
+            // replace the current stored models and gradients
+            for (int istore = 0; istore < Mbfgs-1; istore++){
+                for (int k = 0; k < loc_K; k++) {
+                    for (int j = 0; j < loc_J; j++) {
+                        for (int i = 0; i < loc_I; i++) {
+                            grid.Ks_model_store_loc[  I2VLBFGS(istore,i,j,k)] = grid.Ks_model_store_loc[  I2VLBFGS(istore+1,i,j,k)];
+                            grid.Keta_model_store_loc[I2VLBFGS(istore,i,j,k)] = grid.Keta_model_store_loc[I2VLBFGS(istore+1,i,j,k)];
+                            grid.Kxi_model_store_loc[ I2VLBFGS(istore,i,j,k)] = grid.Kxi_model_store_loc[ I2VLBFGS(istore+1,i,j,k)];
+                            grid.Ks_grad_store_loc[   I2VLBFGS(istore,i,j,k)] = grid.Ks_grad_store_loc[   I2VLBFGS(istore+1,i,j,k)];
+                            grid.Keta_grad_store_loc[ I2VLBFGS(istore,i,j,k)] = grid.Keta_grad_store_loc[ I2VLBFGS(istore+1,i,j,k)];
+                            grid.Kxi_grad_store_loc[  I2VLBFGS(istore,i,j,k)] = grid.Kxi_grad_store_loc[  I2VLBFGS(istore+1,i,j,k)];
+                        }
+                    }
+                }
+            }
+
+            // store new model and gradient
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+                        grid.Ks_model_store_loc[  I2VLBFGS(Mbfgs-1,i,j,k)] = grid.fun_loc[I2V(i,j,k)];
+                        grid.Keta_model_store_loc[I2VLBFGS(Mbfgs-1,i,j,k)] = grid.eta_loc[I2V(i,j,k)];
+                        grid.Kxi_model_store_loc[ I2VLBFGS(Mbfgs-1,i,j,k)] = grid.xi_loc[ I2V(i,j,k)];
+                        grid.Ks_grad_store_loc[   I2VLBFGS(Mbfgs-1,i,j,k)] = grid.Ks_update_loc[  I2V(i,j,k)];
+                        grid.Keta_grad_store_loc[ I2VLBFGS(Mbfgs-1,i,j,k)] = grid.Keta_update_loc[I2V(i,j,k)];
+                        grid.Kxi_grad_store_loc[  I2VLBFGS(Mbfgs-1,i,j,k)] = grid.Kxi_update_loc[ I2V(i,j,k)];
+                    }
+                }
+            }
+        }
+
+   } // end subdom main and id_sim==0
+
+}
+
+
+void calculate_descent_direction_lbfgs(Grid& grid, int i_inv) {
+
+    if (subdom_main && id_sim==0) {
+        int imin = 0;
+        int imax = 0;
+        if (i_inv >= Mbfgs)
+            imax = Mbfgs-2;
+        else
+            imax = i_inv-1;
+
+        CUSTOMREAL* ak_store = allocateMemory<CUSTOMREAL>(imax-imin+1, 2000);
+        CUSTOMREAL* pk_store = allocateMemory<CUSTOMREAL>(imax-imin+1, 2001);
+
+        CUSTOMREAL* desc_wks_Ks   = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 2002);
+        CUSTOMREAL* desc_wks_Keta = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 2003);
+        CUSTOMREAL* desc_wks_Kxi  = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 2004);
+
+        CUSTOMREAL* wks_1_Ks   = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 2005);
+        CUSTOMREAL* wks_1_Keta = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 2006);
+        CUSTOMREAL* wks_1_Kxi  = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 2007);
+        CUSTOMREAL* wks_2_Ks   = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 2008);
+        CUSTOMREAL* wks_2_Keta = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 2009);
+        CUSTOMREAL* wks_2_Kxi  = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 2010);
+
+        std::cout << "imax, imin: " << imax << ", " << imin << std::endl;
+
+        // initialize
+        for (int i = 0; i < imax-imin+1; i++) {
+            ak_store[i] = 0.0; // alpha
+            pk_store[i] = 0.0; //rho
+        }
+
+        for (int k = 0; k < loc_K; k++) {
+            for (int j = 0; j < loc_J; j++) {
+                for (int i = 0; i < loc_I; i++) {
+                    desc_wks_Ks[I2V(i,j,k)]   = grid.Ks_grad_store_loc[  I2VLBFGS(imax+1,i,j,k)]; // d
+                    desc_wks_Keta[I2V(i,j,k)] = grid.Keta_grad_store_loc[I2VLBFGS(imax+1,i,j,k)]; // d
+                    desc_wks_Kxi[I2V(i,j,k)]  = grid.Kxi_grad_store_loc[ I2VLBFGS(imax+1,i,j,k)]; // d
+                    wks_1_Ks[I2V(i,j,k)]   = grid.Ks_grad_store_loc[  I2VLBFGS(imax+1,i,j,k)] - grid.Ks_grad_store_loc[  I2VLBFGS(imax,i,j,k)];
+                    wks_1_Keta[I2V(i,j,k)] = grid.Keta_grad_store_loc[I2VLBFGS(imax+1,i,j,k)] - grid.Keta_grad_store_loc[I2VLBFGS(imax,i,j,k)];
+                    wks_1_Kxi[I2V(i,j,k)]  = grid.Kxi_grad_store_loc[ I2VLBFGS(imax+1,i,j,k)] - grid.Kxi_grad_store_loc[ I2VLBFGS(imax,i,j,k)];
+                }
+            }
+        }
+
+        //calculate l2 norms
+        CUSTOMREAL norm_yiter = 0.0;
+
+        norm_yiter += calc_l2norm(wks_1_Ks,loc_I*loc_J*loc_K);
+        norm_yiter += calc_l2norm(wks_1_Keta,loc_I*loc_J*loc_K);
+        norm_yiter += calc_l2norm(wks_1_Kxi,loc_I*loc_J*loc_K);
+
+        CUSTOMREAL tmp = norm_yiter;
+        allreduce_cr_single(tmp, norm_yiter);
+
+        for (int iinv = imax; iinv >= imin; iinv--){
+
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+                        wks_1_Ks[I2V(i,j,k)]   = grid.Ks_grad_store_loc[  I2VLBFGS(iinv+1,i,j,k)]   - grid.Ks_grad_store_loc[I2VLBFGS(iinv,i,j,k)];
+                        wks_1_Keta[I2V(i,j,k)] = grid.Keta_grad_store_loc[I2VLBFGS(iinv+1,i,j,k)] - grid.Keta_grad_store_loc[I2VLBFGS(iinv,i,j,k)];
+                        wks_1_Kxi[I2V(i,j,k)]  = grid.Kxi_grad_store_loc[ I2VLBFGS(iinv+1,i,j,k)]  - grid.Kxi_grad_store_loc[I2VLBFGS(iinv,i,j,k)];
+
+                        wks_2_Ks[I2V(i,j,k)]   = grid.Ks_model_store_loc[  I2VLBFGS(iinv+1,i,j,k)]   - grid.Ks_model_store_loc[I2VLBFGS(iinv,i,j,k)];
+                        wks_2_Keta[I2V(i,j,k)] = grid.Keta_model_store_loc[I2VLBFGS(iinv+1,i,j,k)] - grid.Keta_model_store_loc[I2VLBFGS(iinv,i,j,k)];
+                        wks_2_Kxi[I2V(i,j,k)]  = grid.Kxi_model_store_loc[ I2VLBFGS(iinv+1,i,j,k)]  - grid.Kxi_model_store_loc[I2VLBFGS(iinv,i,j,k)];
+                    }
+                }
+            }
+
+            CUSTOMREAL pk = _0_CR;
+            pk += dot_product(wks_1_Ks,wks_2_Ks,loc_I*loc_J*loc_K);
+            pk += dot_product(wks_1_Keta,wks_2_Keta,loc_I*loc_J*loc_K);
+            pk += dot_product(wks_1_Kxi,wks_2_Kxi,loc_I*loc_J*loc_K);
+            CUSTOMREAL tmp = pk;
+            allreduce_cr_single(tmp, pk);
+            pk_store[iinv] = _1_CR / pk;
+
+            CUSTOMREAL ak = _0_CR;
+            ak += dot_product(wks_2_Ks, desc_wks_Ks,loc_I*loc_J*loc_K);
+            ak += dot_product(wks_2_Keta, desc_wks_Keta,loc_I*loc_J*loc_K);
+            ak += dot_product(wks_2_Kxi, desc_wks_Kxi,loc_I*loc_J*loc_K);
+            // print ak
+            if(myrank== 0) std::cout << "ak: " << ak << std::endl;
+            tmp = ak;
+            allreduce_cr_single(tmp, ak);
+            // print ak
+            if(myrank== 0) std::cout << "ak gathered: " << ak << std::endl;
+            ak_store[iinv] = pk_store[iinv] * ak;
+
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+                        desc_wks_Ks[I2V(i,j,k)]   -= ak_store[iinv] * wks_1_Ks[I2V(i,j,k)];
+                        desc_wks_Keta[I2V(i,j,k)] -= ak_store[iinv] * wks_1_Keta[I2V(i,j,k)];
+                        desc_wks_Kxi[I2V(i,j,k)]  -= ak_store[iinv] * wks_1_Kxi[I2V(i,j,k)];
+                    }
+                }
+            }
+
+        } // end loop iinv
+
+        // Nocedal's default preconditionning
+        CUSTOMREAL pk = _1_CR / (pk_store[imax] * norm_yiter);
+        for (int k = 0; k < loc_K; k++) {
+            for (int j = 0; j < loc_J; j++) {
+                for (int i = 0; i < loc_I; i++) {
+                    desc_wks_Ks[I2V(i,j,k)]   *= pk;
+                    desc_wks_Keta[I2V(i,j,k)] *= pk;
+                    desc_wks_Kxi[I2V(i,j,k)]  *= pk;
+                }
+            }
+        }
+
+        // print ak_store and pk_store
+        if (myrank == 0) {
+            std::cout << "ak_store: ";
+            for (int i = 0; i < imax+1; i++) {
+                std::cout << ak_store[i] << " ";
+            }
+            std::cout << std::endl;
+            std::cout << "pk_store: ";
+            for (int i = 0; i < imax+1; i++) {
+                std::cout << pk_store[i] << " ";
+            }
+            std::cout << std::endl;
+        }
+
+
+        // custom preconditionning
+        // diagonal preconditionner
+
+        for (int iinv = imin; iinv <= imax; iinv++){
+
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+                        wks_1_Ks[I2V(i,j,k)]   = grid.Ks_grad_store_loc[  I2VLBFGS(iinv+1,i,j,k)]   - grid.Ks_grad_store_loc[I2VLBFGS(iinv,i,j,k)];
+                        wks_1_Keta[I2V(i,j,k)] = grid.Keta_grad_store_loc[I2VLBFGS(iinv+1,i,j,k)] - grid.Keta_grad_store_loc[I2VLBFGS(iinv,i,j,k)];
+                        wks_1_Kxi[I2V(i,j,k)]  = grid.Kxi_grad_store_loc[ I2VLBFGS(iinv+1,i,j,k)]  - grid.Kxi_grad_store_loc[I2VLBFGS(iinv,i,j,k)];
+
+                        wks_2_Ks[I2V(i,j,k)]   = grid.Ks_model_store_loc[  I2VLBFGS(iinv+1,i,j,k)]   - grid.Ks_model_store_loc[I2VLBFGS(iinv,i,j,k)];
+                        wks_2_Keta[I2V(i,j,k)] = grid.Keta_model_store_loc[I2VLBFGS(iinv+1,i,j,k)] - grid.Keta_model_store_loc[I2VLBFGS(iinv,i,j,k)];
+                        wks_2_Kxi[I2V(i,j,k)]  = grid.Kxi_model_store_loc[ I2VLBFGS(iinv+1,i,j,k)]  - grid.Kxi_model_store_loc[I2VLBFGS(iinv,i,j,k)];
+                    }
+                }
+            }
+
+            CUSTOMREAL beta = _0_CR;
+            beta += dot_product(wks_1_Ks,desc_wks_Ks,loc_I*loc_J*loc_K);
+            beta += dot_product(wks_1_Keta,desc_wks_Keta,loc_I*loc_J*loc_K);
+            beta += dot_product(wks_1_Kxi,desc_wks_Kxi,loc_I*loc_J*loc_K);
+            CUSTOMREAL tmp = beta;
+            allreduce_cr_single(tmp, beta);
+            beta = pk_store[iinv] * beta;
+
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+                        desc_wks_Ks[I2V(i,j,k)]   += (ak_store[iinv]-beta) * wks_2_Ks[I2V(i,j,k)];
+                        desc_wks_Keta[I2V(i,j,k)] += (ak_store[iinv]-beta) * wks_2_Keta[I2V(i,j,k)];
+                        desc_wks_Kxi[I2V(i,j,k)]  += (ak_store[iinv]-beta) * wks_2_Kxi[I2V(i,j,k)];
+                    }
+                }
+            }
+        }
+
+        // descent directions
+        for (int k = 0; k < loc_K; k++) {
+            for (int j = 0; j < loc_J; j++) {
+                for (int i = 0; i < loc_I; i++) {
+                    grid.Ks_descent_dir_loc[I2V(i,j,k)]   = - _1_CR * desc_wks_Ks[I2V(i,j,k)];
+                    grid.Keta_descent_dir_loc[I2V(i,j,k)] = - _1_CR * desc_wks_Keta[I2V(i,j,k)];
+                    grid.Kxi_descent_dir_loc[I2V(i,j,k)]  = - _1_CR * desc_wks_Kxi[I2V(i,j,k)];
+                    //grid.Ks_descent_dir_loc[I2V(i,j,k)]   = desc_wks_Ks[I2V(i,j,k)];
+                    //grid.Keta_descent_dir_loc[I2V(i,j,k)] = desc_wks_Keta[I2V(i,j,k)];
+                    //grid.Kxi_descent_dir_loc[I2V(i,j,k)]  = desc_wks_Kxi[I2V(i,j,k)];
+
+                }
+            }
+        }
+
+
+        delete [] ak_store;
+        delete [] pk_store;
+        delete [] desc_wks_Ks;
+        delete [] desc_wks_Keta;
+        delete [] desc_wks_Kxi;
+        delete [] wks_1_Ks;
+        delete [] wks_1_Keta;
+        delete [] wks_1_Kxi;
+        delete [] wks_2_Ks;
+        delete [] wks_2_Keta;
+        delete [] wks_2_Kxi;
+
+
+    } // end of subdom_main
+
+    // share with all simultaneous run
+    if (subdom_main){
+        broadcast_cr_inter_sim(grid.Ks_descent_dir_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Keta_descent_dir_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Kxi_descent_dir_loc, loc_I*loc_J*loc_K, 0);
+    }
+
+}
+
+/*
+// laplacian approximation in cartesian coordinates
+inline void calc_laplacian_field(Grid& grid, CUSTOMREAL* arr_in, CUSTOMREAL* arr_res) {
+    if (subdom_main) {
+
+        //CUSTOMREAL lr = 1.0;
+        //CUSTOMREAL lt = 1.0;
+        //CUSTOMREAL lp = 1.0;
+
+        CUSTOMREAL lr = regul_lp;
+        CUSTOMREAL lt = regul_lt;
+        CUSTOMREAL lp = regul_lp;
+
+        // calculate L(m)
+        for (int k = 1; k < loc_K-1; k++) {
+            for (int j = 1; j < loc_J-1; j++) {
+                for (int i = 1; i < loc_I-1; i++) {
+                    arr_res[I2V(i,j,k)] = \
+                     - _2_CR * (lr+lt+lp) * arr_in[I2V(i,j,k)] \
+                        + lr * (arr_in[I2V(i,j,k+1)] + arr_in[I2V(i,j,k-1)]) \
+                        + lt * (arr_in[I2V(i,j+1,k)] + arr_in[I2V(i,j-1,k)]) \
+                        + lp * (arr_in[I2V(i+1,j,k)] + arr_in[I2V(i-1,j,k)]);
+                }
+            }
+        }
+
+        // communicate with adjacent subdomains
+        grid.send_recev_boundary_data(arr_res);
+    }
+}
+*/
+
+// laplacian approximation in spherical coordinates
+inline void calc_laplacian_field(Grid& grid, CUSTOMREAL* d, CUSTOMREAL* arr_out){
+
+    if (subdom_main) {
+
+        CUSTOMREAL lr = regul_lp;
+        CUSTOMREAL lt = regul_lt;
+        CUSTOMREAL lp = regul_lp;
+        CUSTOMREAL dr = grid.dr;
+        CUSTOMREAL dt = grid.dt;
+        CUSTOMREAL dp = grid.dp;
+
+        CUSTOMREAL r,t;
+
+        // calculate L(m) in sphercal coordinates
+        for (int k = 1; k < loc_K-1; k++) {
+            for (int j = 1; j < loc_J-1; j++) {
+                for (int i = 1; i < loc_I-1; i++) {
+                    r = grid.r_loc_1d[k];
+                    t = grid.t_loc_1d[j];
+
+                    // finite difference approximation of laplacian spherical coordinates
+                    arr_out[I2V(i,j,k)] = \
+                        lr*lr*((d[I2V(i,j,k+1)] - _2_CR*d[I2V(i,j,k)] + d[I2V(i,j,k-1)])/(dr*dr) \
+                           +(_2_CR/r)*(d[I2V(i,j,k+1)]-d[I2V(i,j,k-1)])/dr) \
+                      + lt*lt*((_1_CR/(r*r*std::sin(t)))*(std::cos(t)*(d[I2V(i,j+1,k)]-d[I2V(i,j-1,k)])/dt \
+                        - (std::sin(t)*(d[I2V(i,j+1,k)]-_2_CR*d[I2V(i,j,k)]+d[I2V(i,j-1,k)])/(dt*dt)))) \
+                      + lp*lp*((_1_CR/(r*r*std::sin(t)*std::sin(t)))*(d[I2V(i+1,j,k)]-_2_CR*d[I2V(i,j,k)]+d[I2V(i-1,j,k)])/(dp*dp));
+
+                }
+            }
+        }
+
+        // communicate with adjacent subdomains
+        grid.send_recev_boundary_data(arr_out);
+    }
+}
+
+
+
+
+// add 1/2 * L(m)^2 to the objective function
+inline CUSTOMREAL calculate_regularization_obj(Grid& grid) {
+
+    CUSTOMREAL regularization_term = _0_CR;
+
+    if (subdom_main && id_sim==0) {
+        int ngrid = loc_I*loc_J*loc_K;
+
+        regularization_term += weight_Tikonov     * calc_l2norm(grid.fun_regularization_penalty_loc, ngrid);
+        regularization_term += weight_Tikonov_ani * calc_l2norm(grid.eta_regularization_penalty_loc, ngrid);
+        regularization_term += weight_Tikonov_ani * calc_l2norm(grid.xi_regularization_penalty_loc, ngrid);
+
+        // gather from all subdomain
+        CUSTOMREAL tmp = regularization_term;
+        allreduce_cr_single(tmp, regularization_term);
+    }
+
+    // share retularization_term with all simultaneous run (may be unnecessary)
+    broadcast_cr_single_inter_sim(regularization_term, 0);
+
+    return _0_5_CR * regularization_term;
+
+}
+
+
+inline void init_regularization_penalty(Grid& grid) {
+    if (subdom_main && id_sim==0) {
+        int ngrid = loc_I*loc_J*loc_K;
+
+        // initialize regularization penalties
+        std::fill(grid.fun_regularization_penalty_loc, grid.fun_regularization_penalty_loc + ngrid, _0_CR);
+        std::fill(grid.eta_regularization_penalty_loc, grid.eta_regularization_penalty_loc + ngrid, _0_CR);
+        std::fill(grid.xi_regularization_penalty_loc, grid.xi_regularization_penalty_loc + ngrid, _0_CR);
+        std::fill(grid.fun_gradient_regularization_penalty_loc, grid.fun_gradient_regularization_penalty_loc + ngrid, _0_CR);
+        std::fill(grid.eta_gradient_regularization_penalty_loc, grid.eta_gradient_regularization_penalty_loc + ngrid, _0_CR);
+        std::fill(grid.xi_gradient_regularization_penalty_loc, grid.xi_gradient_regularization_penalty_loc + ngrid, _0_CR);
+    }
+}
+
+inline void init_regulaization_penalty_with_one(Grid& grid) {
+    if (subdom_main && id_sim==0) {
+        int ngrid = loc_I*loc_J*loc_K;
+
+        // initialize regularization penalties
+        std::fill(grid.fun_regularization_penalty_loc, grid.fun_regularization_penalty_loc + ngrid, _1_CR);
+        std::fill(grid.eta_regularization_penalty_loc, grid.eta_regularization_penalty_loc + ngrid, _1_CR);
+        std::fill(grid.xi_regularization_penalty_loc, grid.xi_regularization_penalty_loc + ngrid, _1_CR);
+        std::fill(grid.fun_gradient_regularization_penalty_loc, grid.fun_gradient_regularization_penalty_loc + ngrid, _1_CR);
+        std::fill(grid.eta_gradient_regularization_penalty_loc, grid.eta_gradient_regularization_penalty_loc + ngrid, _1_CR);
+        std::fill(grid.xi_gradient_regularization_penalty_loc, grid.xi_gradient_regularization_penalty_loc + ngrid, _1_CR);
+    }
+}
+
+
+inline void calculate_regularization_penalty(Grid& grid) {
+    if (subdom_main && id_sim==0) {
+        int n_grid = loc_I*loc_J*loc_K;
+
+        CUSTOMREAL* tmp_fun = allocateMemory<CUSTOMREAL>(n_grid, 2011);
+        CUSTOMREAL* tmp_eta = allocateMemory<CUSTOMREAL>(n_grid, 2012);
+        CUSTOMREAL* tmp_xi  = allocateMemory<CUSTOMREAL>(n_grid, 2013);
+
+        for (int i = 0; i < n_grid; i++) {
+            tmp_fun[i] = grid.fun_loc[i] - grid.fun_prior_loc[i];
+            tmp_eta[i] = grid.eta_loc[i] - grid.eta_prior_loc[i];
+            tmp_xi[i]  = grid.xi_loc[i]  - grid.xi_prior_loc[i];
+        }
+
+        init_regularization_penalty(grid);
+
+        // calculate LL(m) on fun (Ks)
+        calc_laplacian_field(grid, tmp_fun, grid.fun_regularization_penalty_loc);
+        calc_laplacian_field(grid, grid.fun_regularization_penalty_loc, grid.fun_gradient_regularization_penalty_loc);
+
+        // calculate LL(m) on eta (Keta)
+        calc_laplacian_field(grid, tmp_eta, grid.eta_regularization_penalty_loc);
+        calc_laplacian_field(grid, grid.eta_regularization_penalty_loc, grid.eta_gradient_regularization_penalty_loc);
+
+        // calculate LL(m) on xi (Kxi)
+        calc_laplacian_field(grid, tmp_xi, grid.xi_regularization_penalty_loc);
+        calc_laplacian_field(grid, grid.xi_regularization_penalty_loc, grid.xi_gradient_regularization_penalty_loc);
+
+        for (int i = 0; i < n_grid; i++){
+            // calculate gradient_regularization_penalty first for avoiding using overwrited value in regularization_penalty
+            grid.fun_gradient_regularization_penalty_loc[i] = damp_weight_fun*damp_weight_fun*(tmp_fun[i] - _2_CR * grid.fun_regularization_penalty_loc[i] + grid.fun_gradient_regularization_penalty_loc[i]);
+            grid.eta_gradient_regularization_penalty_loc[i] = damp_weight_eta*damp_weight_eta*(tmp_eta[i] - _2_CR * grid.eta_regularization_penalty_loc[i] + grid.eta_gradient_regularization_penalty_loc[i]);
+            grid.xi_gradient_regularization_penalty_loc[i]  = damp_weight_xi *damp_weight_xi *(tmp_xi[i]  - _2_CR * grid.xi_regularization_penalty_loc[i]  + grid.xi_gradient_regularization_penalty_loc[i]);
+        }
+
+        grid.send_recev_boundary_data(grid.fun_gradient_regularization_penalty_loc);
+        grid.send_recev_boundary_data(grid.eta_gradient_regularization_penalty_loc);
+        grid.send_recev_boundary_data(grid.xi_gradient_regularization_penalty_loc);
+
+        for (int i = 0; i < n_grid; i++){
+            // calculate gradient_regularization_penalty first for avoiding using overwrited value in regularization_penalty
+
+            grid.fun_regularization_penalty_loc[i] = damp_weight_fun*(tmp_fun[i] - grid.fun_regularization_penalty_loc[i]);
+            grid.eta_regularization_penalty_loc[i] = damp_weight_eta*(tmp_eta[i] - grid.eta_regularization_penalty_loc[i]);
+            grid.xi_regularization_penalty_loc[i]  = damp_weight_xi *(tmp_xi[i]  - grid.xi_regularization_penalty_loc[i]);
+        }
+
+        grid.send_recev_boundary_data(grid.fun_regularization_penalty_loc);
+        grid.send_recev_boundary_data(grid.eta_regularization_penalty_loc);
+        grid.send_recev_boundary_data(grid.xi_regularization_penalty_loc);
+
+        delete [] tmp_fun;
+        delete [] tmp_eta;
+        delete [] tmp_xi;
+    }
+}
+
+
+// add grad(L(m)) to gradient
+inline void add_regularization_grad(Grid& grid) {
+    if (subdom_main){
+        if(id_sim==0){
+            // add LL(m) to gradient
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+
+                        //grid.fun_gradient_regularization_penalty_loc[I2V(i,j,k)]   /= Linf_Ks;
+                        //grid.eta_gradient_regularization_penalty_loc[I2V(i,j,k)] /= Linf_Keta;
+                        //grid.xi_gradient_regularization_penalty_loc[I2V(i,j,k)]  /= Linf_Kxi;
+
+                        grid.Ks_update_loc[I2V(i,j,k)]   += weight_Tikonov     * grid.fun_gradient_regularization_penalty_loc[I2V(i,j,k)];
+                        grid.Keta_update_loc[I2V(i,j,k)] += weight_Tikonov_ani * grid.eta_gradient_regularization_penalty_loc[I2V(i,j,k)];
+                        grid.Kxi_update_loc[I2V(i,j,k)]  += weight_Tikonov_ani * grid.xi_gradient_regularization_penalty_loc[I2V(i,j,k)];
+                    }
+                }
+            }
+
+            grid.send_recev_boundary_data(grid.Ks_update_loc);
+            grid.send_recev_boundary_data(grid.Keta_update_loc);
+            grid.send_recev_boundary_data(grid.Kxi_update_loc);
+
+        }// end id_sim==0
+
+
+        // share with all simultaneous run (may be unnecessary)
+        broadcast_cr_inter_sim(grid.Ks_update_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Keta_update_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Kxi_update_loc, loc_I*loc_J*loc_K, 0);
+
+    }// end subdom_main
+}
+
+
+// compute initial guess for step length to try for line search
+void initial_guess_step(Grid& grid, CUSTOMREAL& step_length, CUSTOMREAL SC_VAL) {
+    // find the max value in descent_direction
+    CUSTOMREAL max_val = _0_CR, max_val_all = _0_CR;
+    const CUSTOMREAL max_relative_pert = 0.02;
+
+    if(subdom_main && id_sim==0) {
+        for (int k = 0; k < loc_K; k++) {
+            for (int j = 0; j < loc_J; j++) {
+                for (int i = 0; i < loc_I; i++) {
+                    if (std::abs(grid.Ks_descent_dir_loc[I2V(i,j,k)]  ) > max_val) max_val = std::abs(grid.Ks_descent_dir_loc[I2V(i,j,k)]);
+                    if (std::abs(grid.Keta_descent_dir_loc[I2V(i,j,k)]) > max_val) max_val = std::abs(grid.Keta_descent_dir_loc[I2V(i,j,k)]);
+                    if (std::abs(grid.Kxi_descent_dir_loc[I2V(i,j,k)] ) > max_val) max_val = std::abs(grid.Kxi_descent_dir_loc[I2V(i,j,k)]);
+                }
+            }
+        }
+
+        // reduce
+        allreduce_cr_single_max(max_val, max_val_all);
+
+        // debug print max_val
+        std::cout << "max_val: " << max_val_all << std::endl;
+
+        // step size
+        step_length = max_relative_pert / SC_VAL / max_val_all;
+    }
+
+    // broadcast to all simultaneous run (may be unnecessary)
+    if (subdom_main)
+        broadcast_cr_single_inter_sim(step_length, 0);
+
+}
+
+
+inline CUSTOMREAL compute_volume_domain(Grid& grid) {
+    CUSTOMREAL volume = _0_CR;
+
+    if (subdom_main && id_sim==0) {
+        volume += calc_l2norm(grid.fun_regularization_penalty_loc, loc_I*loc_J*loc_K);
+        volume += calc_l2norm(grid.eta_regularization_penalty_loc, loc_I*loc_J*loc_K);
+        volume += calc_l2norm(grid.xi_regularization_penalty_loc, loc_I*loc_J*loc_K);
+
+        // reduce
+        allreduce_cr_single(volume, volume);
+    }
+
+    // broadcast to all simultaneous run (may be unnecessary)
+    broadcast_cr_single_inter_sim(volume, 0);
+
+    return volume;
+}
+
+
+inline void compute_damp_weights(const Grid& grid) {
+    // (size/sum)**2
+
+    if(subdom_main){
+        CUSTOMREAL sum_fun = std::accumulate(grid.fun_prior_loc, grid.fun_prior_loc + loc_I*loc_J*loc_K, _0_CR);
+        //CUSTOMREAL sum_eta = std::accumulate(grid.eta_prior_loc, grid.eta_prior_loc + loc_I*loc_J*loc_K, _0_CR); // do not use
+        //CUSTOMREAL sum_xi  = std::accumulate(grid.xi_prior_loc, grid.xi_prior_loc + loc_I*loc_J*loc_K, _0_CR);
+
+        allreduce_cr_single(sum_fun, sum_fun);
+
+        damp_weight_fun = (volume_domain / sum_fun)*(volume_domain / sum_fun);
+        //damp_weight_eta = (volume / sum_eta)*(volume / sum_eta);
+        //damp_weight_xi  = (volume / sum_xi)*(volume / sum_xi);
+        damp_weight_eta = _1_CR;
+        damp_weight_xi  = _1_CR;
+
+    }
+
+}
+
+#endif
--- a/include/main_routines_calling.h
+++ b/include/main_routines_calling.h
@@ -0,0 +1,923 @@
+#ifndef MAIN_ROUTINES_CALLING_H
+#define MAIN_ROUTINES_CALLING_H
+
+#include <iostream>
+#include <memory>
+#include "mpi_funcs.h"
+#include "config.h"
+#include "utils.h"
+#include "input_params.h"
+#include "grid.h"
+#include "io.h"
+#include "main_routines_inversion_mode.h"
+#include "model_optimization_routines.h"
+#include "main_routines_earthquake_relocation.h"
+#include "iterator_selector.h"
+#include "iterator.h"
+#include "iterator_legacy.h"
+#include "iterator_level.h"
+#include "source.h"
+#include "receiver.h"
+#include "kernel.h"
+#include "model_update.h"
+#include "lbfgs.h"
+#include "objective_function_utils.h"
+#include "timer.h"
+#include "oneD_inversion.h"
+
+// run forward-only or inversion mode
+inline void run_forward_only_or_inversion(InputParams &IP, Grid &grid, IO_utils &io) {
+
+    // for check if the current source is the first source
+    bool first_src = true;
+
+    if(myrank == 0)
+        std::cout << "id_sim: " << id_sim << ", size of src_map: " << IP.src_map.size() << std::endl;
+
+    // estimate running time
+    Timer timer("Forward_or_inversion", true);
+
+    // prepare objective_function file
+    std::ofstream out_main; // close() is not mandatory
+    prepare_header_line(IP, out_main);
+
+    if (id_sim==0) {
+        io.prepare_grid_inv_xdmf(0);
+
+        //io.change_xdmf_obj(0); // change xmf file for next src
+        io.change_group_name_for_model();
+
+        // write out model info
+        io.write_vel(grid, 0);
+        io.write_xi( grid, 0);
+        io.write_eta(grid, 0);
+        //io.write_zeta(grid, i_inv); // TODO
+
+        if (IP.get_verbose_output_level()){
+            io.write_a(grid,   0);
+            io.write_b(grid,   0);
+            io.write_c(grid,   0);
+            io.write_f(grid,   0);
+            io.write_fun(grid, 0);
+        }
+
+        // // output model_parameters_inv_0000.dat
+        // if (IP.get_if_output_model_dat())
+        //     io.write_concerning_parameters(grid, 0, IP);
+    }
+
+
+    // output station correction file (only for teleseismic differential data)
+    IP.write_station_correction_file(0);
+
+    synchronize_all_world();
+
+    /////////////////////
+    // loop for inversion
+    /////////////////////
+
+    bool line_search_mode = false; // if true, run_simulation_one_step skips adjoint simulation and only calculates objective function value
+
+    // objective function for all src
+    CUSTOMREAL v_obj = 0.0, old_v_obj = 0.0;
+    std::vector<CUSTOMREAL> v_obj_misfit(20, 0.0);
+
+    for (int i_inv = 0; i_inv < IP.get_max_iter_inv(); i_inv++) {
+
+        if(myrank == 0 && id_sim ==0){
+            std::cout << "iteration " << i_inv << " starting ... " << std::endl;
+        }
+
+        old_v_obj = v_obj;
+
+        // prepare inverstion iteration group in xdmf file
+        io.prepare_grid_inv_xdmf(i_inv);
+
+        ///////////////////////////////////////////////////////
+        // run (forward and adjoint) simulation for each source
+        ///////////////////////////////////////////////////////
+
+        // run forward and adjoint simulation and calculate current objective function value and sensitivity kernel for all sources
+        line_search_mode = false;
+        // skip for the mode with sub-iteration
+        if (i_inv > 0 && optim_method != GRADIENT_DESCENT) {
+        } else {
+            bool is_save_T = false;
+            v_obj_misfit = run_simulation_one_step(IP, grid, io, i_inv, first_src, line_search_mode, is_save_T);
+            v_obj = v_obj_misfit[0];
+        }
+
+        // wait for all processes to finish
+        synchronize_all_world();
+
+        // check if v_obj is nan
+        if (std::isnan(v_obj)) {
+            if (myrank == 0)
+                std::cout << "v_obj is nan, stop inversion" << std::endl;
+            // stop inversion
+            break;
+        }
+
+        // output src rec file with the result arrival times
+        if (IP.get_if_output_in_process_data() || i_inv == IP.get_max_iter_inv()-1 || i_inv==0) {
+            IP.write_src_rec_file(i_inv,0);
+        }
+
+        ///////////////
+        // model update
+        ///////////////
+        if(myrank == 0 && id_sim ==0)
+            std::cout << "model update starting ... " << std::endl;
+
+        if (IP.get_run_mode() == DO_INVERSION) {
+            if (optim_method == GRADIENT_DESCENT)
+                model_optimize(IP, grid, io, i_inv, v_obj, old_v_obj, first_src, out_main);
+            else if (optim_method == HALVE_STEPPING_MODE)
+                v_obj_misfit = model_optimize_halve_stepping(IP, grid, io, i_inv, v_obj, first_src, out_main);
+            else if (optim_method == LBFGS_MODE) {
+                bool found_next_step = model_optimize_lbfgs(IP, grid, io, i_inv, v_obj, first_src, out_main);
+                if (!found_next_step)
+                    goto end_of_inversion;
+            }
+        }
+
+        // output station correction file (only for teleseismic differential data)
+        IP.write_station_correction_file(i_inv + 1);
+
+        // output objective function
+        write_objective_function(IP, i_inv, v_obj_misfit, out_main, "model update");
+
+        // since model is update. The written traveltime field should be discraded.
+        // initialize is_T_written_into_file
+        for (int i_src = 0; i_src < IP.n_src_this_sim_group; i_src++){
+            const std::string name_sim_src = IP.get_src_name(i_src);
+
+            if (proc_store_srcrec) // only proc_store_srcrec has the src_map object
+                IP.src_map[name_sim_src].is_T_written_into_file = false;
+        }
+
+        // output updated model
+        if (id_sim==0) {
+            //io.change_xdmf_obj(0); // change xmf file for next src
+            io.change_group_name_for_model();
+
+            // write out model info
+            if (IP.get_if_output_in_process() || i_inv >= IP.get_max_iter_inv() - 2){
+                io.write_vel(grid, i_inv+1);
+                io.write_xi( grid, i_inv+1);
+                io.write_eta(grid, i_inv+1);
+            }
+            //io.write_zeta(grid, i_inv); // TODO
+
+            if (IP.get_verbose_output_level()){
+                io.write_a(grid,   i_inv+1);
+                io.write_b(grid,   i_inv+1);
+                io.write_c(grid,   i_inv+1);
+                io.write_f(grid,   i_inv+1);
+                io.write_fun(grid, i_inv+1);
+            }
+
+            // // output model_parameters_inv_0000.dat
+            // if (IP.get_if_output_model_dat()
+            // && (IP.get_if_output_in_process() || i_inv >= IP.get_max_iter_inv() - 2))
+            //     io.write_concerning_parameters(grid, i_inv + 1, IP);
+
+        }
+
+        // writeout temporary xdmf file
+        io.update_xdmf_file();
+
+        // wait for all processes to finish
+        synchronize_all_world();
+
+
+        // estimate running time
+        CUSTOMREAL time_elapsed = timer.get_t();
+        if (id_sim == 0 && myrank == 0 && i_inv < IP.get_max_iter_inv()-1) {
+            const time_t end_time_estimated = time_elapsed / (i_inv + 1) * (IP.get_max_iter_inv() - i_inv - 1) + timer.get_start();
+            auto will_run_time = (int)(time_elapsed/(i_inv + 1) * (IP.get_max_iter_inv() - i_inv - 1));
+
+            std::cout << std::endl;
+            std::cout << "The program begins at " << timer.get_start_t() << std::endl;
+            std::cout << "Iteration (" << i_inv + 1 << "/" << IP.get_max_iter_inv() << ") finished at " << time_elapsed << " seconds" << std::endl;
+            std::cout << i_inv + 1 << " iterations run " << timer.get_t() << " seconds, the rest of " << IP.get_max_iter_inv() - i_inv - 1 << " iterations require " << will_run_time << " seconds." << std::endl;
+            std::cout << "The program is estimated to stop at " << timer.get_utc_from_time_t(end_time_estimated) << std::endl;
+            std::cout << std::endl;
+        }
+
+        // output current state of the model
+        if (IP.get_if_output_final_model()) {
+            // io.write_final_model(grid, IP);
+            io.write_merged_model(grid, IP, "final_model.h5");
+        }
+        if (IP.get_if_output_middle_model()) {
+            std::string tmp_fname = "middle_model_step_" + int2string_zero_fill(i_inv + 1) + ".h5";
+            io.write_merged_model(grid, IP, tmp_fname);
+        }
+
+
+    } // end loop inverse
+
+end_of_inversion:
+
+    // close xdmf file
+    io.finalize_data_output_file();
+
+    timer.stop_timer();
+    if (id_sim == 0 && myrank == 0) {
+        std::cout << std::endl;
+        std::cout << "The program begin at " << timer.get_start_t() << std::endl;
+        std::cout << "Forward_or_inversion end at " << timer.get_end_t() << std::endl;
+        std::cout << "It has run " << timer.get_elapsed_t() << " seconds in total." << std::endl;
+        std::cout << std::endl;
+    }
+}
+
+
+// run earthquake relocation mode
+inline void run_earthquake_relocation(InputParams& IP, Grid& grid, IO_utils& io) {
+
+    Receiver recs;
+
+    // calculate traveltime for each receiver (swapped from source) and write in output file
+    calculate_traveltime_for_all_src_rec(IP, grid, io);
+
+    // prepare output for iteration status
+    std::ofstream out_main; // close() is not mandatory
+    prepare_header_line(IP, out_main);
+
+    // objective function and its gradient
+    CUSTOMREAL v_obj      = 999999999.0;
+
+    int        i_iter     = 0;
+
+    std::vector<CUSTOMREAL> v_obj_misfit;
+
+    // iterate
+    while (true) {
+
+        v_obj      = 0.0;
+
+        // calculate gradient of objective function at sources
+        v_obj_misfit = calculate_gradient_objective_function(IP, grid, io, i_iter);
+        v_obj = v_obj_misfit[0];
+
+        // update source location
+        recs.update_source_location(IP, grid);
+
+        synchronize_all_world();
+
+        // check convergence
+        bool finished = false;
+
+        if (subdom_main && id_subdomain==0) {
+            if (i_iter >= N_ITER_MAX_SRC_RELOC){
+                std::cout << "Finished relocation because iteration number exceeds the maximum " << N_ITER_MAX_SRC_RELOC << std::endl;
+                finished = true;
+            }
+            allreduce_bool_single_inplace_sim(finished); //LAND
+        }
+
+        synchronize_all_world();
+
+        // check if all processes have finished
+        broadcast_bool_inter_and_intra_sim(finished, 0);
+
+        synchronize_all_world();
+
+        // output location information
+        if(id_sim == 0 && myrank == 0){
+            // write objective function
+            std::cout << "iteration: " << i_iter << ", objective function: "              << v_obj << std::endl;
+        }
+
+        // write objective functions
+        write_objective_function(IP, i_iter, v_obj_misfit, out_main, "relocation");
+
+        // write out new src_rec_file
+        if (IP.get_if_output_in_process_data() || i_iter == N_ITER_MAX_SRC_RELOC-1 || i_iter==0){
+            IP.write_src_rec_file(0,i_iter);
+        }
+
+        // modify the receiver's location for output
+        IP.modify_swapped_source_location();
+
+
+        if (finished)
+            break;
+
+        // new iteration
+        i_iter++;
+    }
+
+    // modify the receiver's location
+    IP.modify_swapped_source_location();
+    // write out new src_rec_file
+    IP.write_src_rec_file(0,i_iter);
+    // close xdmf file
+    io.finalize_data_output_file();
+
+}
+
+
+// run earthquake relocation mode
+inline void run_inversion_and_relocation(InputParams& IP, Grid& grid, IO_utils& io) {
+
+    Timer timer("Inv_and_reloc", true);
+
+    /////////////////////
+    // preparation of model update
+    /////////////////////
+
+    // for check if the current source is the first source
+    bool first_src = true;
+
+    if(myrank == 0)
+        std::cout << "id_sim: " << id_sim << ", size of src_map: " << IP.src_map.size() << std::endl;
+
+    // prepare objective_function file
+    std::ofstream out_main; // close() is not mandatory
+    prepare_header_line(IP, out_main);
+
+    if (id_sim==0) {
+        // prepare inverstion iteration group in xdmf file
+        io.prepare_grid_inv_xdmf(0);
+
+        //io.change_xdmf_obj(0); // change xmf file for next src
+        io.change_group_name_for_model();
+
+        // write out model info
+        io.write_vel(grid, 0);
+        io.write_xi( grid, 0);
+        io.write_eta(grid, 0);
+        //io.write_zeta(grid, i_inv); // TODO
+
+        if (IP.get_verbose_output_level()){
+            io.write_a(grid,   0);
+            io.write_b(grid,   0);
+            io.write_c(grid,   0);
+            io.write_f(grid,   0);
+            io.write_fun(grid, 0);
+        }
+
+        // // output model_parameters_inv_0000.dat
+        // if (IP.get_if_output_model_dat())
+        //     io.write_concerning_parameters(grid, 0, IP);
+    }
+
+    // output station correction file (only for teleseismic differential data)
+    IP.write_station_correction_file(0);
+
+    synchronize_all_world();
+
+    bool line_search_mode = false; // if true, run_simulation_one_step skips adjoint simulation and only calculates objective function value
+
+    /////////////////////
+    // preparation of relocation
+    /////////////////////
+
+    Receiver recs;
+
+    /////////////////////
+    // initializae objective function
+    /////////////////////
+
+    CUSTOMREAL v_obj = 0.0,         old_v_obj = 10000000000.0;
+    std::vector<CUSTOMREAL> v_obj_misfit(20, 0.0);
+
+    if (inv_mode == ITERATIVE){
+
+        /////////////////////
+        // main loop for model update and relocation
+        /////////////////////
+
+        int model_update_step = 0;
+        int relocation_step = 0;
+
+        for (int i_loop = 0; i_loop < IP.get_max_loop_mode0(); i_loop++){
+            /////////////////////
+            // stage 1, update model parameters
+            /////////////////////
+
+            for (int one_loop_i_inv = 0; one_loop_i_inv < IP.get_model_update_N_iter(); one_loop_i_inv++){
+                // the actural step index of model update
+                int i_inv = i_loop * IP.get_model_update_N_iter() + one_loop_i_inv;
+
+                if(myrank == 0 && id_sim ==0){
+                    std::cout   << std::endl;
+                    std::cout   << "loop " << i_loop+1 << ", model update iteration " << one_loop_i_inv+1
+                                << " ( the " << i_inv+1 << "-th model update) starting ... " << std::endl;
+                    std::cout   << std::endl;
+                }
+
+
+                // prepare inverstion iteration group in xdmf file
+                io.prepare_grid_inv_xdmf(i_inv);
+
+                ///////////////////////////////////////////////////////
+                // run (forward and adjoint) simulation for each source
+                ///////////////////////////////////////////////////////
+
+                // run forward and adjoint simulation and calculate current objective function value and sensitivity kernel for all sources
+                line_search_mode = false;
+                // skip for the mode with sub-iteration
+                if (i_inv > 0 && optim_method != GRADIENT_DESCENT) {
+                } else {
+                    bool is_save_T = false;
+                    v_obj_misfit = run_simulation_one_step(IP, grid, io, i_inv, first_src, line_search_mode, is_save_T);
+                    v_obj = v_obj_misfit[0];
+                }
+
+                // wait for all processes to finish
+                synchronize_all_world();
+
+                // check if v_obj is nan
+                if (std::isnan(v_obj)) {
+                    if (myrank == 0)
+                        std::cout << "v_obj is nan, stop inversion" << std::endl;
+                    // stop inversion
+                    break;
+                }
+
+                ///////////////
+                // model update
+                ///////////////
+
+                if (optim_method == GRADIENT_DESCENT)
+                    model_optimize(IP, grid, io, i_inv, v_obj, old_v_obj, first_src, out_main);
+                else if (optim_method == HALVE_STEPPING_MODE)
+                    v_obj_misfit = model_optimize_halve_stepping(IP, grid, io, i_inv, v_obj, first_src, out_main);
+                else if (optim_method == LBFGS_MODE) {
+                    bool found_next_step = model_optimize_lbfgs(IP, grid, io, i_inv, v_obj, first_src, out_main);
+
+                    if (!found_next_step)
+                        break;
+                }
+
+                // define old_v_obj
+                old_v_obj = v_obj;
+
+                // output objective function
+                write_objective_function(IP, i_inv, v_obj_misfit, out_main, "model update");
+
+                // since model is update. The written traveltime field should be discraded.
+                // initialize is_T_written_into_file
+                for (int i_src = 0; i_src < IP.n_src_this_sim_group; i_src++){
+                    const std::string name_sim_src = IP.get_src_name(i_src);
+
+                    if (proc_store_srcrec) // only proc_store_srcrec has the src_map object
+                        IP.src_map[name_sim_src].is_T_written_into_file = false;
+                }
+
+                // output updated model
+                if (id_sim==0) {
+                    //io.change_xdmf_obj(0); // change xmf file for next src
+                    io.change_group_name_for_model();
+
+                    // write out model info
+                    if (IP.get_if_output_in_process() || i_inv >= IP.get_max_loop_mode0()*IP.get_model_update_N_iter() - 2){
+                        io.write_vel(grid, i_inv+1);
+                        io.write_xi( grid, i_inv+1);
+                        io.write_eta(grid, i_inv+1);
+                    }
+                    //io.write_zeta(grid, i_inv); // TODO
+
+                    if (IP.get_verbose_output_level()){
+                        io.write_a(grid,   i_inv+1);
+                        io.write_b(grid,   i_inv+1);
+                        io.write_c(grid,   i_inv+1);
+                        io.write_f(grid,   i_inv+1);
+                        io.write_fun(grid, i_inv+1);
+                    }
+
+                    // // output model_parameters_inv_0000.dat
+                    // if (IP.get_if_output_model_dat()
+                    // && (IP.get_if_output_in_process() || i_inv >= IP.get_max_loop_mode0()*IP.get_model_update_N_iter() - 2))
+                    //     io.write_concerning_parameters(grid, i_inv + 1, IP);
+
+                } // end output updated model
+
+                // writeout temporary xdmf file
+                io.update_xdmf_file();
+
+                // write src rec files
+                if (IP.get_if_output_in_process_data()){
+                    IP.write_src_rec_file(model_update_step,relocation_step);
+                } else if (i_inv == IP.get_max_loop_mode0() * IP.get_model_update_N_iter()-1 || i_inv==0) {
+                    IP.write_src_rec_file(model_update_step,relocation_step);
+                }
+                model_update_step += 1;
+
+                // wait for all processes to finish
+                synchronize_all_world();
+
+                // output current state of the model
+                if (IP.get_if_output_final_model()) {
+                    // io.write_final_model(grid, IP);
+                    io.write_merged_model(grid, IP, "final_model.h5");
+                }
+                if (IP.get_if_output_middle_model()) {
+                    std::string tmp_fname = "middle_model_step_" + int2string_zero_fill(i_inv + 1) + ".h5";
+                    io.write_merged_model(grid, IP, tmp_fname);
+                }
+
+            }   // end model update in one loop
+
+
+            /////////////////////
+            // stage 2, update earthquake locations
+            /////////////////////
+
+            if(myrank == 0 && id_sim ==0){
+                std::cout   << std::endl;
+                std::cout   << "loop " << i_loop+1 << ", computing traveltime field for relocation" << std::endl;
+                std::cout   << std::endl;
+            }
+
+            // calculate traveltime for each receiver (swapped from source) and write in output file
+            if (IP.get_relocation_N_iter() > 0) // we really do relocation
+                calculate_traveltime_for_all_src_rec(IP, grid, io);
+
+
+            // initilize all earthquakes
+            if (proc_store_srcrec){
+                for(auto iter = IP.rec_map.begin(); iter != IP.rec_map.end(); iter++){
+                    iter->second.is_stop = false;
+                }
+            }
+
+            // iterate
+            for (int one_loop_i_iter = 0; one_loop_i_iter < IP.get_relocation_N_iter(); one_loop_i_iter++){
+                int i_iter = i_loop * IP.get_relocation_N_iter() + one_loop_i_iter;
+
+                if(myrank == 0 && id_sim ==0){
+                    std::cout   << std::endl;
+                    std::cout   << "loop " << i_loop+1 << ", relocation iteration " << one_loop_i_iter+1
+                                << " ( the " << i_iter+1 << "-th relocation) starting ... " << std::endl;
+                    std::cout   << std::endl;
+                }
+
+
+                // calculate gradient of objective function at sources
+                v_obj_misfit = calculate_gradient_objective_function(IP, grid, io, i_iter);
+                v_obj = v_obj_misfit[0];
+
+                // update source location
+                recs.update_source_location(IP, grid);
+
+                synchronize_all_world();
+
+
+                // output location information
+                if(id_sim == 0 && myrank == 0){
+                    std::cout << "objective function: "              << v_obj << std::endl;
+                }
+
+                // write objective functions
+                write_objective_function(IP, i_iter, v_obj_misfit, out_main, "relocation");
+
+                // write out new src_rec_file
+                if (IP.get_if_output_in_process_data()){
+                    IP.write_src_rec_file(model_update_step,relocation_step);
+                } else if (i_iter == IP.get_max_loop_mode0() * IP.get_relocation_N_iter()-1 || i_iter==0) {
+                    IP.write_src_rec_file(model_update_step,relocation_step);
+                }
+
+                // modify the receiver's location for output
+                IP.modify_swapped_source_location();
+
+                relocation_step += 1;
+            } // end relocation loop
+
+            grid.rejuvenate_abcf();     // (a,b/r^2,c/(r^2*cos^2),f/(r^2*cos)) -> (a,b,c,f)
+
+            // estimate running time
+            CUSTOMREAL time_elapsed = timer.get_t();
+            if (id_sim == 0 && myrank == 0 && i_loop < IP.get_max_loop_mode0()-1) {
+                const time_t end_time_estimated = time_elapsed / (i_loop + 1) * (IP.get_max_loop_mode0() - i_loop - 1) + timer.get_start();
+                auto will_run_time = (int)(time_elapsed/(i_loop + 2) * (IP.get_max_loop_mode0() - i_loop - 1));
+
+                std::cout << std::endl;
+                std::cout << "The program begins at " << timer.get_start_t() << std::endl;
+                std::cout << "Loop (" << i_loop + 1 << "/" << IP.get_max_loop_mode0() << ") finished at " << time_elapsed << " seconds" << std::endl;
+                std::cout << i_loop + 1 << " loop run " << timer.get_t() << " seconds, the rest of " << IP.get_max_loop_mode0() - i_loop - 1 << " iterations require " << will_run_time << " seconds." << std::endl;
+                std::cout << "The program is estimated to stop at " << timer.get_utc_from_time_t(end_time_estimated) << std::endl;
+                std::cout << std::endl;
+            }
+
+        } // end loop for model update and relocation
+    } else if (inv_mode == SIMULTANEOUS) {
+
+        for (int i_loop = 0; i_loop < IP.get_max_loop_mode1(); i_loop++){
+
+            if(myrank == 0 && id_sim ==0){
+                std::cout   << std::endl;
+                std::cout   << "loop " << i_loop+1 << ", model update and relocation starting ... " << std::endl;
+                std::cout   << std::endl;
+            }
+
+            /////////////////////
+            // stage 1, update model parameters
+            /////////////////////
+
+
+            // prepare inversion iteration group in xdmf file
+            io.prepare_grid_inv_xdmf(i_loop);
+
+                ///////////////////////////////////////////////////////
+                // run (forward and adjoint) simulation for each source
+                ///////////////////////////////////////////////////////
+
+            // run forward and adjoint simulation and calculate current objective function value and sensitivity kernel for all sources
+            line_search_mode = false;
+            // skip for the mode with sub-iteration
+            if (i_loop > 0 && optim_method != GRADIENT_DESCENT) {
+            } else {
+                bool is_save_T = true;
+                v_obj_misfit = run_simulation_one_step(IP, grid, io, i_loop, first_src, line_search_mode, is_save_T);
+                v_obj = v_obj_misfit[0];
+            }
+
+            // wait for all processes to finish
+            synchronize_all_world();
+
+            // check if v_obj is nan
+            if (std::isnan(v_obj)) {
+                if (myrank == 0)
+                    std::cout << "v_obj is nan, stop inversion" << std::endl;
+
+                // stop inversion
+                break;
+            }
+
+                ///////////////
+                // model update
+                ///////////////
+
+            if (optim_method == GRADIENT_DESCENT)
+                model_optimize(IP, grid, io, i_loop, v_obj, old_v_obj, first_src, out_main);
+            else if (optim_method == HALVE_STEPPING_MODE)
+                v_obj_misfit = model_optimize_halve_stepping(IP, grid, io, i_loop, v_obj, first_src, out_main);
+            else if (optim_method == LBFGS_MODE) {
+                bool found_next_step = model_optimize_lbfgs(IP, grid, io, i_loop, v_obj, first_src, out_main);
+
+                if (!found_next_step)
+                    break;
+            }
+
+
+            // output objective function (model update part)
+            write_objective_function(IP, i_loop, v_obj_misfit, out_main, "model update");
+
+            /////////////////////
+            // stage 2, update earthquake locations
+            /////////////////////
+
+
+            // initilize all earthquakes
+            if (proc_store_srcrec){
+                for(auto iter = IP.rec_map.begin(); iter != IP.rec_map.end(); iter++){
+                    iter->second.is_stop = false;
+                }
+            }
+
+            // calculate gradient of objective function at sources
+            v_obj_misfit = calculate_gradient_objective_function(IP, grid, io, i_loop);
+
+            // update source location
+            recs.update_source_location(IP, grid);
+
+            synchronize_all_world();
+
+            // output objective function (relocation part)
+            write_objective_function(IP, i_loop, v_obj_misfit, out_main, "relocation");
+
+
+            // define old_v_obj
+            old_v_obj   = v_obj;
+
+
+            // since model is update. The written traveltime field should be discraded.
+            // initialize is_T_written_into_file
+            for (int i_src = 0; i_src < IP.n_src_this_sim_group; i_src++){
+                const std::string name_sim_src = IP.get_src_name(i_src);
+
+                if (proc_store_srcrec) // only proc_store_srcrec has the src_map object
+                    IP.src_map[name_sim_src].is_T_written_into_file = false;
+            }
+
+            // output updated model
+            if (id_sim==0) {
+                //io.change_xdmf_obj(0); // change xmf file for next src
+                io.change_group_name_for_model();
+
+                // write out model info
+                if (IP.get_if_output_in_process() || i_loop >= IP.get_max_loop_mode1() - 2){
+                    io.write_vel(grid, i_loop+1);
+                    io.write_xi( grid, i_loop+1);
+                    io.write_eta(grid, i_loop+1);
+                }
+                //io.write_zeta(grid, i_inv); // TODO
+
+                if (IP.get_verbose_output_level()){
+                    io.write_a(grid,   i_loop+1);
+                    io.write_b(grid,   i_loop+1);
+                    io.write_c(grid,   i_loop+1);
+                    io.write_f(grid,   i_loop+1);
+                    io.write_fun(grid, i_loop+1);
+                }
+
+                // // output model_parameters_inv_0000.dat
+                // if (IP.get_if_output_model_dat()
+                // && (IP.get_if_output_in_process() || i_loop >= IP.get_max_loop_mode1() - 2))
+                //     io.write_concerning_parameters(grid, i_loop + 1, IP);
+
+            } // end output updated model
+
+            // writeout temporary xdmf file
+            io.update_xdmf_file();
+
+            // write src rec files
+            if (IP.get_if_output_in_process_data()){
+                IP.write_src_rec_file(i_loop,i_loop);
+            } else if (i_loop == IP.get_max_loop_mode1() -1 || i_loop==0) {
+                IP.write_src_rec_file(i_loop,i_loop);
+            }
+
+            // modify the receiver's location for output
+            IP.modify_swapped_source_location();
+
+            // wait for all processes to finish
+            synchronize_all_world();
+
+            grid.rejuvenate_abcf();     // (a,b/r^2,c/(r^2*cos^2),f/(r^2*cos)) -> (a,b,c,f)
+
+            // estimate running time
+            CUSTOMREAL time_elapsed = timer.get_t();
+            if (id_sim == 0 && myrank == 0 && i_loop < IP.get_max_loop_mode1()-1) {
+                const time_t end_time_estimated = time_elapsed / (i_loop + 1) * (IP.get_max_loop_mode1() - i_loop - 1) + timer.get_start();
+                auto will_run_time = (int)(time_elapsed/(i_loop + 2) * (IP.get_max_loop_mode1() - i_loop - 1));
+
+                std::cout << std::endl;
+                std::cout << "The program begins at " << timer.get_start_t() << std::endl;
+                std::cout << "Loop (" << i_loop + 1 << "/" << IP.get_max_loop_mode1() << ") finished at " << time_elapsed << " seconds" << std::endl;
+                std::cout << i_loop + 1 << " loop run " << timer.get_t() << " seconds, the rest of " << IP.get_max_loop_mode1() - i_loop - 1 << " iterations require " << will_run_time << " seconds." << std::endl;
+                std::cout << "The program is estimated to stop at " << timer.get_utc_from_time_t(end_time_estimated) << std::endl;
+                std::cout << std::endl;
+            }
+
+            // output current state of the model
+            if (IP.get_if_output_final_model()) {
+                // io.write_final_model(grid, IP);
+                io.write_merged_model(grid, IP, "final_model.h5");
+            }
+            if (IP.get_if_output_middle_model()) {
+                std::string tmp_fname = "middle_model_step_" + int2string_zero_fill(i_loop + 1) + ".h5";
+                io.write_merged_model(grid, IP, tmp_fname);
+            }
+        } // end loop for model update and relocation
+
+    } else {
+        std::cout << "Error: inv_mode is not defined" << std::endl;
+        exit(1);
+    }
+    // close xdmf file
+    io.finalize_data_output_file();
+
+    timer.stop_timer();
+    if (id_sim == 0 && myrank == 0) {
+        std::cout << std::endl;
+        std::cout << "The program begin at " << timer.get_start_t();
+        std::cout << "Inv_and_reloc end at " << timer.get_end_t();
+        std::cout << "It has run " << timer.get_elapsed_t() << " seconds in total." << std::endl;
+        std::cout << std::endl;
+    }
+
+}
+
+
+// run 1D inversion mode
+inline void run_1d_inversion(InputParams& IP, Grid& grid, IO_utils& io) {
+    OneDInversion oneDInv(IP, grid);
+
+    if(myrank == 0)
+        std::cout << "id_sim: " << id_sim << ", size of src_map: " << IP.src_map.size() << std::endl;
+
+    // estimate running time
+    Timer timer("1D_inversion", true);
+
+    // prepare objective_function file
+    std::ofstream out_main; // close() is not mandatory
+    prepare_header_line(IP, out_main);
+
+    synchronize_all_world();
+
+    /////////////////////
+    // loop for inversion
+    /////////////////////
+
+    // objective function for all src
+    std::vector<CUSTOMREAL> v_obj_misfit(20, 0.0);
+
+    for (int i_inv = 0; i_inv < IP.get_max_iter_inv(); i_inv++) {
+        if(myrank == 0 && id_sim ==0){
+            std::cout << "iteration " << i_inv << " starting ... " << std::endl;
+        }
+
+        // prepare inverstion iteration group in xdmf file
+        io.prepare_grid_inv_xdmf(i_inv);
+
+        ///////////////////////////////////////////////////////
+        // run (forward and adjoint) simulation for each source
+        ///////////////////////////////////////////////////////
+        v_obj_misfit = oneDInv.run_simulation_one_step_1dinv(IP, io, i_inv);
+
+        // wait for all processes to finish
+        synchronize_all_world();
+
+        // check if v_obj is nan
+        if (std::isnan(v_obj_misfit[0])) {
+            if (myrank == 0)
+                std::cout << "v_obj is nan, stop inversion" << std::endl;
+            // stop inversion
+            break;
+        }
+
+        // output src rec file with the result arrival times
+        if (IP.get_if_output_in_process_data()){
+            IP.write_src_rec_file(i_inv,0);
+        } else if (i_inv == IP.get_max_iter_inv()-1 || i_inv==0) {
+            IP.write_src_rec_file(i_inv,0);
+        }
+
+        ///////////////
+        // model update
+        ///////////////
+        if(myrank == 0 && id_sim ==0)
+            std::cout << "model update starting ... " << std::endl;
+        oneDInv.model_optimize_1dinv(IP, grid, io, i_inv);
+
+        // output objective function
+        write_objective_function(IP, i_inv, v_obj_misfit, out_main, "1d inversion");
+
+        // output updated model
+        if (id_sim==0) {
+            //io.change_xdmf_obj(0); // change xmf file for next src
+            io.change_group_name_for_model();
+
+            // write out model info
+            if (IP.get_if_output_in_process() || i_inv >= IP.get_max_iter_inv() - 2){
+                io.write_vel(grid, i_inv+1);
+                io.write_xi( grid, i_inv+1);
+                io.write_eta(grid, i_inv+1);
+            }
+        }
+
+        // writeout temporary xdmf file
+        io.update_xdmf_file();
+
+        // wait for all processes to finish
+        synchronize_all_world();
+
+        // estimate running time
+        CUSTOMREAL time_elapsed = timer.get_t();
+        if (id_sim == 0 && myrank == 0 && i_inv < IP.get_max_iter_inv()-1) {
+            const time_t end_time_estimated = time_elapsed / (i_inv + 1) * (IP.get_max_iter_inv() - i_inv - 1) + timer.get_start();
+            auto will_run_time = (int)(time_elapsed/(i_inv + 1) * (IP.get_max_iter_inv() - i_inv - 1));
+
+            std::cout << std::endl;
+            std::cout << "The program begins at " << timer.get_start_t() << std::endl;
+            std::cout << "Iteration (" << i_inv + 1 << "/" << IP.get_max_iter_inv() << ") finished at " << time_elapsed << " seconds" << std::endl;
+            std::cout << i_inv + 1 << " iterations run " << timer.get_t() << " seconds, the rest of " << IP.get_max_iter_inv() - i_inv - 1 << " iterations require " << will_run_time << " seconds." << std::endl;
+            std::cout << "The program is estimated to stop at " << timer.get_utc_from_time_t(end_time_estimated) << std::endl;
+            std::cout << std::endl;
+        }
+
+        // output current state of the model
+        if (IP.get_if_output_final_model()) {
+            // io.write_final_model(grid, IP);
+            io.write_merged_model(grid, IP, "final_model.h5");
+        }
+        if (IP.get_if_output_middle_model()) {
+            std::string tmp_fname = "middle_model_step_" + int2string_zero_fill(i_inv + 1) + ".h5";
+            io.write_merged_model(grid, IP, tmp_fname);
+        }
+
+        // wait for all processes to finish
+        synchronize_all_world();
+    } // end loop inverse
+
+    // close xdmf file
+    io.finalize_data_output_file();
+
+    timer.stop_timer();
+    if (id_sim == 0 && myrank == 0) {
+        std::cout << std::endl;
+        std::cout << "The program begin at " << timer.get_start_t() << std::endl;
+        std::cout << "1d model inversion ended at " << timer.get_end_t() << std::endl;
+        std::cout << "It has run " << timer.get_elapsed_t() << " seconds in total." << std::endl;
+        std::cout << std::endl;
+    }
+}
+
+#endif // MAIN_ROUTINES_CALLING_H
--- a/include/main_routines_earthquake_relocation.h
+++ b/include/main_routines_earthquake_relocation.h
@@ -0,0 +1,149 @@
+#ifndef MAIN_ROUTINES_EARTHQUAKE_RELOCATION_H
+#define MAIN_ROUTINES_EARTHQUAKE_RELOCATION_H
+
+#include <iostream>
+#include <memory>
+#include "mpi_funcs.h"
+#include "config.h"
+#include "utils.h"
+#include "input_params.h"
+#include "grid.h"
+#include "io.h"
+#include "iterator_selector.h"
+#include "iterator.h"
+#include "iterator_legacy.h"
+#include "iterator_level.h"
+#include "source.h"
+#include "receiver.h"
+#include "main_routines_inversion_mode.h"
+
+
+// calculate traveltime field for all sources-receivers and write to file
+void calculate_traveltime_for_all_src_rec(InputParams& IP, Grid& grid, IO_utils& io){
+
+    // check if this run is in source receiver swap mode
+    // if not, stop the program
+    if (IP.get_is_srcrec_swap() == false){
+        std::cout << "Error: Source relocation mode must run with swap_src_rec = true" << std::endl;
+        exit(1);
+    }
+
+    // reinitialize factors
+    grid.reinitialize_abcf();
+
+    // prepare inverstion iteration group in xdmf file
+    io.prepare_grid_inv_xdmf(0);
+
+    ///////////////////////
+    // loop for each source
+    ///////////////////////
+
+    Source src;
+
+    // iterate over sources
+    for (int i_src = 0; i_src < IP.n_src_this_sim_group; i_src++){
+
+        const std::string name_sim_src = IP.get_src_name(i_src);
+        const int         id_sim_src   = IP.get_src_id(name_sim_src); // global source id
+
+        // check if the source is teleseismic or not
+        // because teleseismic source is not supported in this mode
+        bool is_teleseismic = IP.get_if_src_teleseismic(name_sim_src);
+
+        if (is_teleseismic){
+            std::cout << "Error: Teleseismic source is not supported in source relocation mode." << std::endl;
+            exit(1);
+        }
+
+        // set simu group id and source name for output files/dataset names
+        io.reset_source_info(id_sim_src, name_sim_src);
+
+        // set source position
+        src.set_source_position(IP, grid, is_teleseismic, name_sim_src);
+
+        // initialize iterator object
+        bool first_init = (i_src==0);
+
+        // initialize iterator object
+        std::unique_ptr<Iterator> It;
+        select_iterator(IP, grid, src, io, name_sim_src, first_init, is_teleseismic, It, false);
+
+        /////////////////////////
+        // run forward simulation
+        /////////////////////////
+
+        if (proc_store_srcrec){
+            auto srcmap_this = IP.get_src_point(name_sim_src);
+
+            std::cout << "calculating source (" << i_src+1 << "/" << IP.n_src_this_sim_group << "), name: "
+                      << name_sim_src << ", lat: " << srcmap_this.lat
+                      << ", lon: " << srcmap_this.lon << ", dep: " << srcmap_this.dep
+                      << std::endl;
+        }
+
+        bool write_tmp_T = true;
+        calculate_or_read_traveltime_field(IP, grid, io, i_src, IP.n_src_this_sim_group, first_init, It, name_sim_src, write_tmp_T);
+
+    }
+
+    // wait for all processes to finish traveltime calculation
+    synchronize_all_world();
+}
+
+
+std::vector<CUSTOMREAL> calculate_gradient_objective_function(InputParams& IP, Grid& grid, IO_utils& io, int i_iter){
+
+    Receiver recs; // here the source is swapped to receiver
+
+    // initialize source parameters (obj, kernel, )
+    recs.init_vars_src_reloc(IP);
+
+    // iterate over sources (to obtain gradient of traveltime field T and absolute traveltime, common source differential traveltime, and common receiver differential traveltime)
+    for (int i_src = 0; i_src < IP.n_src_this_sim_group; i_src++){
+
+        const std::string name_sim_src = IP.get_src_name(i_src);
+        const int         id_sim_src   = IP.get_src_id(name_sim_src); // global source id
+
+        // set simu group id and source name for output files/dataset names
+        io.reset_source_info(id_sim_src, name_sim_src);
+
+        // load travel time field on grid.T_loc
+        io.read_T_tmp(grid);
+
+        // calculate travel time at the actual source location (absolute traveltime, common source differential traveltime)
+        recs.interpolate_and_store_arrival_times_at_rec_position(IP, grid, name_sim_src);
+
+        // calculate gradient at the actual source location
+        recs.calculate_T_gradient(IP, grid, name_sim_src);
+    }
+
+    // wait for all processes to finish
+    synchronize_all_world();
+
+    // gather all the traveltime to the main process and distribute to all processes
+    // for calculating the synthetic (common receiver differential traveltime)
+    IP.gather_traveltimes_and_calc_syn_diff();
+
+
+    // iterate over sources for calculating gradient of objective function
+    for (int i_src = 0; i_src < IP.n_src_this_sim_group; i_src++){
+        const std::string name_sim_src = IP.get_src_name(i_src);
+             
+        // calculate gradient of objective function with respect to location and ortime
+        recs.calculate_grad_obj_src_reloc(IP, name_sim_src);
+    }
+
+    // compute the objective function
+    std::vector<CUSTOMREAL> obj_residual = recs.calculate_obj_reloc(IP, i_iter);
+
+    // sum grad_tau and grad_chi_k of all simulation groups
+    IP.allreduce_rec_map_grad_src();
+
+    //synchronize_all_world(); // not necessary here because allreduce is already synchronizing communication
+
+    return obj_residual;
+}
+
+
+
+#endif // MAIN_ROUTINES_EARTHQUAKE_RELOCATION_H
--- a/include/main_routines_inversion_mode.h
+++ b/include/main_routines_inversion_mode.h
@@ -0,0 +1,293 @@
+#ifndef MAIN_ROUTINES_INVERSION_MODE_H
+#define MAIN_ROUTINES_INVERSION_MODE_H
+
+#include <iostream>
+#include <memory>
+#include "mpi_funcs.h"
+#include "config.h"
+#include "utils.h"
+#include "input_params.h"
+#include "grid.h"
+#include "io.h"
+#include "iterator_selector.h"
+#include "iterator.h"
+#include "iterator_legacy.h"
+#include "iterator_level.h"
+#include "source.h"
+#include "receiver.h"
+#include "kernel.h"
+#include "model_update.h"
+#include "lbfgs.h"
+
+
+inline void calculate_or_read_traveltime_field(InputParams& IP, Grid& grid, IO_utils& io, const int i_src, const int N_src, bool first_init,
+                                               std::unique_ptr<Iterator>& It, const std::string& name_sim_src, const bool& prerun=false){
+
+    if (IP.get_is_T_written_into_file(name_sim_src)){
+        // load travel time field on grid.T_loc
+        if (myrank == 0){
+            std::cout << "id_sim: " << id_sim << ", reading source (" << i_src+1 << "/" << N_src
+                    << "), name: "
+                    << name_sim_src << ", lat: " << IP.src_map[name_sim_src].lat
+                    << ", lon: " << IP.src_map[name_sim_src].lon << ", dep: " << IP.src_map[name_sim_src].dep
+                    << std::endl;
+        }
+
+        io.read_T_tmp(grid);
+
+    } else {
+        // We need to solve eikonal equation
+        if (myrank == 0){
+            std::cout << "id_sim: " << id_sim << ", calculating source (" << i_src+1 << "/" << N_src
+                    << "), name: "
+                    << name_sim_src << ", lat: " << IP.src_map[name_sim_src].lat
+                    << ", lon: " << IP.src_map[name_sim_src].lon << ", dep: " << IP.src_map[name_sim_src].dep
+                    << std::endl;
+        }
+
+        // solve travel time field on grid.T_loc
+        It->run_iteration_forward(IP, grid, io, first_init);
+
+        // writeout travel time field
+        if (prerun) {
+            // write the temporary traveltime field into file for later use
+            io.write_T_tmp(grid);
+
+            if (proc_store_srcrec) // only proc_store_srcrec has the src_map object
+                IP.src_map[name_sim_src].is_T_written_into_file = true;
+        }
+   }
+}
+
+
+
+inline void pre_run_forward_only(InputParams& IP, Grid& grid, IO_utils& io, int i_inv){
+    if(world_rank == 0)
+        std::cout << "preparing traveltimes of common receiver data ..." << std::endl;
+
+    Source src;
+    Receiver recs;
+
+    // noted that src_map_comm_rec is the subset of src_map
+    for (int i_src = 0; i_src < IP.n_src_comm_rec_this_sim_group; i_src++){
+
+        // check if this is the first iteration of entire inversion process
+        bool first_init = (i_inv == 0 && i_src==0);
+
+        // get source info
+        std::string name_sim_src   = IP.get_src_name_comm(i_src);
+        int         id_sim_src     = IP.get_src_id(name_sim_src); // global source id
+        bool        is_teleseismic = IP.get_if_src_teleseismic(name_sim_src); // get is_teleseismic flag
+
+        // set simu group id and source name for output files/dataset names
+        io.reset_source_info(id_sim_src, name_sim_src);
+
+        // set source position
+        src.set_source_position(IP, grid, is_teleseismic, name_sim_src);
+
+     // initialize iterator object
+        std::unique_ptr<Iterator> It;
+
+        select_iterator(IP, grid, src, io, name_sim_src, first_init, is_teleseismic, It, false);
+
+        // calculate or read traveltime field
+        bool prerun_mode = true;
+        calculate_or_read_traveltime_field(IP, grid, io, i_src, IP.n_src_comm_rec_this_sim_group, first_init, It, name_sim_src, prerun_mode);
+
+        // interpolate and store  traveltime, cs_dif. For cr_dif, only store the traveltime.
+        recs.interpolate_and_store_arrival_times_at_rec_position(IP, grid, name_sim_src);
+        // CHS: At this point, all the synthesised arrival times for all the co-located stations are recorded in syn_time_map_sr. When you need to use it later, you can just look it up.
+    }
+
+
+    // wait for all processes to finish
+    synchronize_all_world();
+
+    if(world_rank == 0)
+        std::cout << "synthetic traveltimes of common receiver data have been prepared." << std::endl;
+
+    // gather all the traveltime to the main process and distribute to all processes
+    // for calculating cr_dif
+    IP.gather_traveltimes_and_calc_syn_diff();
+
+
+}
+
+
+// run forward and adjoint simulation and calculate current objective function value and sensitivity kernel if requested
+inline std::vector<CUSTOMREAL> run_simulation_one_step(InputParams& IP, Grid& grid, IO_utils& io, int i_inv, bool& first_src, bool line_search_mode, bool is_save_T){
+
+    // initialize kernel arrays
+    if (IP.get_run_mode() == DO_INVERSION || IP.get_run_mode() == INV_RELOC)
+        grid.initialize_kernels();
+
+    // reinitialize factors
+    grid.reinitialize_abcf();
+
+    ///////////////////////////////////////////////////////////////////////
+    //  compute the synthetic common receiver differential traveltime first
+    ///////////////////////////////////////////////////////////////////////
+
+    // prepare synthetic traveltime for all earthquakes, if
+    //  1. common receiver data exists;
+    //  2. we use common receiver data to update model; (cr + not swap) or (cs + swap)
+    //  3. we do inversion
+    if ( src_pair_exists &&
+              ((IP.get_use_cr() && !IP.get_is_srcrec_swap())
+            || (IP.get_use_cs() && IP.get_is_srcrec_swap()) )
+        && (IP.get_run_mode() == DO_INVERSION || IP.get_run_mode() == INV_RELOC)){
+        pre_run_forward_only(IP, grid, io, i_inv);
+    }
+    //
+    // loop over all sources
+    //
+
+    Source src;
+    Receiver recs;
+
+    if(world_rank == 0)
+        std::cout << "computing traveltime field, adjoint field and kernel ..." << std::endl;
+
+    // iterate over sources
+    for (int i_src = 0; i_src < IP.n_src_this_sim_group; i_src++){
+
+        // check if this is the first iteration of entire inversion process
+        bool first_init = (i_inv == 0 && i_src==0);
+
+        // get source info
+        const std::string name_sim_src   = IP.get_src_name(i_src);                  // source name
+        const int         id_sim_src     = IP.get_src_id(name_sim_src);             // global source id
+        bool              is_teleseismic = IP.get_if_src_teleseismic(name_sim_src); // get is_teleseismic flag
+
+        // set simu group id and source name for output files/dataset names
+        io.reset_source_info(id_sim_src, name_sim_src);
+
+        // output initial field
+        if(first_src && IP.get_if_output_source_field()) {
+            // write true solution
+            if (if_test){
+                io.write_true_solution(grid);
+            }
+
+            first_src = false;
+        }
+
+        /////////////////////////
+        // run forward simulation
+        /////////////////////////
+
+        // (re) initialize source object and set to grid
+        src.set_source_position(IP, grid, is_teleseismic, name_sim_src);
+
+        // initialize iterator object
+        std::unique_ptr<Iterator> It;
+
+        if (!hybrid_stencil_order){
+            select_iterator(IP, grid, src, io, name_sim_src, first_init, is_teleseismic, It, false);
+
+            // if traveltime field has been wriiten into the file, we choose to read the traveltime data.
+            calculate_or_read_traveltime_field(IP, grid, io, i_src, IP.n_src_this_sim_group, first_init, It, name_sim_src, is_save_T);
+
+        } else {
+            // hybrid stencil mode
+            std::cout << "\nrunnning in hybrid stencil mode\n" << std::endl;
+
+            // run 1st order forward simulation
+            std::unique_ptr<Iterator> It_pre;
+            IP.set_stencil_order(1);
+            IP.set_conv_tol(IP.get_conv_tol()*100.0);
+            select_iterator(IP, grid, src, io, name_sim_src, first_init, is_teleseismic, It_pre, false);
+            calculate_or_read_traveltime_field(IP, grid, io, i_src, IP.n_src_this_sim_group, first_init, It_pre, name_sim_src, is_save_T);
+
+            // run 3rd order forward simulation
+            IP.set_stencil_order(3);
+            IP.set_conv_tol(IP.get_conv_tol()/100.0);
+            select_iterator(IP, grid, src, io, name_sim_src, first_init, is_teleseismic, It, true);
+            calculate_or_read_traveltime_field(IP, grid, io, i_src, IP.n_src_this_sim_group, first_init, It, name_sim_src, is_save_T);
+        }
+
+        // output the result of forward simulation
+        // ignored for inversion mode.
+        if (!line_search_mode && IP.get_if_output_source_field()) {
+
+            // output T (result timetable)
+            io.write_T(grid, i_inv);
+
+            // output T0v
+            //io.write_T0v(grid,i_inv); // initial Timetable
+            // output u (true solution)
+            //if (if_test)
+            //   io.write_u(grid);  // true Timetable
+            // output tau
+            //io.write_tau(grid, i_inv); // calculated deviation
+            // output residual (residual = true_solution - result)
+            //if (if_test)
+            //    io.write_residual(grid); // this will over write the u_loc, so we need to call write_u_h5 first
+        }
+        // calculate the arrival times at each receivers
+        recs.interpolate_and_store_arrival_times_at_rec_position(IP, grid, name_sim_src);
+
+        /////////////////////////
+        // run adjoint simulation
+        /////////////////////////
+
+        // if (myrank == 0){
+        //     std::cout << "calculating adjoint field, source (" << i_src+1 << "/" << (int)IP.src_id2name.size() << "), name: "
+        //             << name_sim_src << ", lat: " << IP.src_map[name_sim_src].lat
+        //             << ", lon: " << IP.src_map[name_sim_src].lon << ", dep: " << IP.src_map[name_sim_src].dep
+        //             << std::endl;
+        // }
+
+        if (IP.get_run_mode()==DO_INVERSION || IP.get_run_mode()==INV_RELOC){
+            // calculate adjoint source
+            recs.calculate_adjoint_source(IP, name_sim_src);
+            // run iteration for adjoint field calculation
+            int adj_type = 0;   // compute adjoint field
+            It->run_iteration_adjoint(IP, grid, io, adj_type);
+            // run iteration for density of the adjoint field
+            adj_type = 1;   // compute adjoint field
+            It->run_iteration_adjoint(IP, grid, io, adj_type);
+            // calculate sensitivity kernel
+            calculate_sensitivity_kernel(grid, IP, name_sim_src);
+            if (subdom_main && !line_search_mode && IP.get_if_output_source_field()) {
+                // adjoint field will be output only at the end of subiteration
+                // output the result of adjoint simulation
+                io.write_adjoint_field(grid,i_inv);
+            }
+
+            // io.write_adjoint_field(grid,i_inv);
+
+            // check adjoint source
+            // if (proc_store_srcrec){
+            //     for (auto iter = IP.rec_map.begin(); iter != IP.rec_map.end(); iter++){
+            //         std::cout << "rec id: " << iter->second.id << ", rec name: " << iter->second.name << ", adjoint source: " << iter->second.adjoint_source << std::endl;
+            //     }
+            // }
+
+        } // end if run_mode == DO_INVERSION
+
+        // wait for all processes to finish
+        // this should not be called here, for the case that the simultaneous run group has different number of sources
+        //synchronize_all_world();
+
+    } // end for i_src
+    // synchronize all processes
+    synchronize_all_world();
+
+    // gather all the traveltime to the main process and distribute to all processes
+    // for calculating the synthetic common receiver differential traveltime
+    if ( IP.get_run_mode()==ONLY_FORWARD ||                 // case 1. if we are doing forward modeling, traveltime is not prepared for computing cr_dif data. Now we need to compute it
+        (!IP.get_use_cr() && !IP.get_is_srcrec_swap())  ||   // case 2-1, we do inversion, but we do not use cr data (cr + no swap)
+        (!IP.get_use_cs() &&  IP.get_is_srcrec_swap())){     // case 2-2, we do inversion, but we do not use cr data (cs +    swap)
+        IP.gather_traveltimes_and_calc_syn_diff();
+    }
+    // compute all residual and obj
+    std::vector<CUSTOMREAL> obj_residual = recs.calculate_obj_and_residual(IP);
+
+
+    // return current objective function value
+    return obj_residual;
+}
+
+
+#endif // MAIN_ROUTINES_INVERSION_MODE_H
--- a/include/model_optimization_routines.h
+++ b/include/model_optimization_routines.h
@@ -0,0 +1,496 @@
+#ifndef MODEL_OPTIMIZATION_ROUTINES_H
+#define MODEL_OPTIMIZATION_ROUTINES_H
+
+#include <iostream>
+#include <memory>
+#include "mpi_funcs.h"
+#include "config.h"
+#include "utils.h"
+#include "input_params.h"
+#include "grid.h"
+#include "io.h"
+#include "main_routines_calling.h"
+#include "iterator_selector.h"
+#include "iterator.h"
+#include "iterator_legacy.h"
+#include "iterator_level.h"
+#include "source.h"
+#include "receiver.h"
+#include "kernel.h"
+#include "model_update.h"
+#include "lbfgs.h"
+#include "objective_function_utils.h"
+
+
+// do model update by gradient descent
+inline void model_optimize(InputParams& IP, Grid& grid, IO_utils& io, int i_inv, \
+                    CUSTOMREAL& v_obj_inout, CUSTOMREAL& old_v_obj, bool& first_src, std::ofstream& out_main) {
+
+    // check kernel density
+    check_kernel_density(grid, IP);
+
+    // sum kernels among all simultaneous runs
+    sumup_kernels(grid);
+
+    // write out original kernels
+    if (id_sim==0 && subdom_main && IP.get_if_output_kernel() && (IP.get_if_output_in_process() || i_inv >= IP.get_max_iter_inv() - 2 || i_inv == 0)) {
+        // store kernel only in the first src datafile
+        io.change_group_name_for_model();
+
+        // write kernel
+        io.write_Ks(grid, i_inv);
+        io.write_Keta(grid, i_inv);
+        io.write_Kxi(grid, i_inv);
+
+        // write kernel density
+        io.write_Ks_density(grid, i_inv);
+        io.write_Kxi_density(grid, i_inv);
+        io.write_Keta_density(grid, i_inv);
+    }
+
+    // smooth kernels (multigrid, and kdensity normalization)
+    smooth_kernels(grid, IP);
+
+    // write out modified kernels
+    if (id_sim==0 && subdom_main && IP.get_if_output_kernel() && (IP.get_if_output_in_process() || i_inv >= IP.get_max_iter_inv() - 2 || i_inv == 0)) {
+        // store kernel only in the first src datafile
+        io.change_group_name_for_model();
+
+        // // kernel over density
+        // io.write_Ks_over_Kden(grid, i_inv);
+        // io.write_Keta_over_Kden(grid, i_inv);
+        // io.write_Kxi_over_Kden(grid, i_inv);
+
+        // kernel over density with smoothing
+        io.write_Ks_update(grid, i_inv);
+        io.write_Keta_update(grid, i_inv);
+        io.write_Kxi_update(grid, i_inv);
+
+        // density with smoothing
+        io.write_Ks_density_update(grid, i_inv);
+        io.write_Kxi_density_update(grid, i_inv);
+        io.write_Keta_density_update(grid, i_inv);
+    }
+
+    // change stepsize
+    // Option 1: the step length is modulated when obj changes.
+    if (step_method == OBJ_DEFINED){
+        if(i_inv != 0){
+            if (v_obj_inout < old_v_obj) {
+                step_length_init    = std::min((CUSTOMREAL)0.02, step_length_init);
+                if(myrank == 0 && id_sim == 0){
+                    std::cout << std::endl;
+                    std::cout << "The obj keeps decreasing, from " << old_v_obj << " to " << v_obj_inout
+                            << ", the step length is " << step_length_init << std::endl;
+                    std::cout << std::endl;
+                }
+            } else if (v_obj_inout >= old_v_obj) {
+                step_length_init    = std::max((CUSTOMREAL)0.0001, step_length_init*step_length_decay);
+                if(myrank == 0 && id_sim == 0){
+                    std::cout << std::endl;
+                    std::cout << "The obj keep increases, from " << old_v_obj << " to " << v_obj_inout
+                            << ", the step length decreases from " << step_length_init/step_length_decay
+                            << " to " << step_length_init << std::endl;
+                    std::cout << std::endl;
+                }
+            }
+        } else {
+            if(myrank == 0 && id_sim == 0){
+                std::cout << std::endl;
+                std::cout << "At the first iteration, the step length is " << step_length_init << std::endl;
+                std::cout << std::endl;
+            }
+        }
+    } else if (step_method == GRADIENT_DEFINED){
+        // Option 2: we modulate the step length according to the angle between the previous and current gradient directions.
+        // If the angle is less than XX degree, which means the model update direction is successive, we should enlarge the step size
+        // Otherwise, the step length should decrease
+        CUSTOMREAL angle = direction_change_of_model_update(grid);
+        if(i_inv != 0){
+            if (angle > step_length_gradient_angle){
+                CUSTOMREAL old_step_length = step_length_init;
+                step_length_init    = std::max((CUSTOMREAL)0.0001, step_length_init * step_length_down);
+                if(myrank == 0 && id_sim == 0){
+                    std::cout << std::endl;
+                    std::cout << "The angle between two update darections is " << angle
+                            << ". Because the angle is greater than " << step_length_gradient_angle << " degree, the step length decreases from "
+                            << old_step_length << " to " << step_length_init << std::endl;
+                    std::cout << std::endl;
+                }
+            } else if (angle <= step_length_gradient_angle) {
+                CUSTOMREAL old_step_length = step_length_init;
+                step_length_init    = std::min((CUSTOMREAL)0.02, step_length_init * step_length_up);
+                if(myrank == 0 && id_sim == 0){
+                    std::cout << std::endl;
+                    std::cout << "The angle between two update darections is " << angle
+                            << ". Because the angle is less than " << step_length_gradient_angle << " degree, the step length increases from "
+                            << old_step_length << " to " << step_length_init << std::endl;
+                    std::cout << std::endl;
+                }
+            }
+        } else {
+            if(myrank == 0 && id_sim == 0){
+                std::cout << std::endl;
+                std::cout << "At the first iteration, the step length is " << step_length_init << std::endl;
+                std::cout << std::endl;
+            }
+        }
+    } else {
+        std::cout << std::endl;
+        std::cout << "No supported method for step size change, step keep the same: " << step_length_init << std::endl;
+        std::cout << std::endl;
+    }
+
+    // broadcast the step_length
+    broadcast_cr_single(step_length_init,0);
+
+    // update the model with the initial step size
+    set_new_model(grid, step_length_init);
+
+    // make station correction
+    IP.station_correction_update(step_length_init_sc);
+
+    // writeout temporary xdmf file
+    if (IP.get_verbose_output_level())
+        io.update_xdmf_file();
+
+    synchronize_all_world();
+}
+
+
+inline std::vector<CUSTOMREAL> model_optimize_halve_stepping(InputParams& IP, Grid& grid, IO_utils& io, int i_inv, CUSTOMREAL& v_obj_inout, bool& first_src, std::ofstream& out_main) {
+
+    CUSTOMREAL step_length = step_length_init; // step size init is global variable
+    CUSTOMREAL diff_obj  = - 9999999999;
+    CUSTOMREAL v_obj_old = v_obj_inout;
+    CUSTOMREAL v_obj_new = 0.0;
+    std::vector<CUSTOMREAL> v_obj_misfit_new = std::vector<CUSTOMREAL>(10, 0.0);
+    int sub_iter_count   = 0;
+
+    // sum kernels among all simultaneous runs
+    sumup_kernels(grid);
+
+    // smooth kernels
+    smooth_kernels(grid, IP);
+
+    // backup the initial model
+    grid.back_up_fun_xi_eta_bcf();
+
+    // update the model with the initial step size
+    set_new_model(grid, step_length);
+
+
+    if (IP.get_verbose_output_level()) {
+        // store kernel only in the first src datafile
+        io.change_group_name_for_model();
+
+        // output updated velocity models
+        io.write_Ks(grid, i_inv);
+        io.write_Keta(grid, i_inv);
+        io.write_Kxi(grid, i_inv);
+
+        // output descent direction
+        io.write_Ks_update(grid, i_inv);
+        io.write_Keta_update(grid, i_inv);
+        io.write_Kxi_update(grid, i_inv);
+    }
+
+    // writeout temporary xdmf file
+    if (IP.get_verbose_output_level())
+        io.update_xdmf_file();
+
+    synchronize_all_world();
+
+    // loop till
+    while (sub_iter_count < max_sub_iterations) {
+        // check the new objective function value
+        // v_obj_new = run_simulation_one_step(IP, grid, io, i_inv, first_src, true); // run simulations with line search mode
+        v_obj_misfit_new = run_simulation_one_step(IP, grid, io, i_inv, first_src, true, false); // run simulations with line search mode
+        v_obj_new = v_obj_misfit_new[0];
+        // if the new objective function value is larger than the old one, make the step width to be half of the previous one
+        diff_obj = v_obj_new - v_obj_old;
+
+        if ( diff_obj > _0_CR // if the objective function value is larger than the old one
+          || std::abs(diff_obj/v_obj_old) > MAX_DIFF_RATIO_VOBJ) { // if the objective function reduced too much  ) {
+
+            // set step length just for writing out
+            step_length_init = step_length;
+            // print status
+            write_objective_function(IP, i_inv, v_obj_misfit_new, out_main, "sub iter");
+
+            grid.restore_fun_xi_eta_bcf();
+            step_length *= HALVE_STEP_RATIO;
+            set_new_model(grid, step_length);
+
+            sub_iter_count++;
+        } else {
+            // if the new objective function value is smaller than the old one, make the step width to be twice of the previous one
+            goto end_of_sub_iteration;
+        }
+
+    }
+
+end_of_sub_iteration:
+
+    v_obj_inout = v_obj_new;
+    step_length_init = step_length/(HALVE_STEP_RATIO)*HALVE_STEP_RESTORAION_RATIO; // use this step size for the next inversion
+
+    // write adjoint field
+    int next_i_inv = i_inv + 1;
+    if (subdom_main && IP.get_if_output_source_field())
+        io.write_adjoint_field(grid,next_i_inv);
+
+    return v_obj_misfit_new;
+
+}
+
+
+// do model update
+inline bool model_optimize_lbfgs(InputParams& IP, Grid& grid, IO_utils& io, int i_inv, CUSTOMREAL& v_obj_inout, bool& first_src, std::ofstream& out_main) {
+
+    int        subiter_count = 0;              // subiteration count
+    //CUSTOMREAL qp_0          = _0_CR;          // store initial p_k * grad(f_k) (descent_direction * gradient)
+    CUSTOMREAL qp_t          = _0_CR;          // store p_k * grad(f_k)
+    CUSTOMREAL td            = _0_CR;          // wolfes right step
+    CUSTOMREAL tg            = _0_CR;          // wolfes left step
+    CUSTOMREAL step_length   = step_length_init; // step size init is global variable
+    CUSTOMREAL v_obj_reg     = _0_CR;          // regularization term == q_t
+    //CUSTOMREAL q_0           = _0_CR;          // initial cost function
+    CUSTOMREAL q_t           = _0_CR;          // current cost function
+    std::vector<CUSTOMREAL> v_obj_misfit_new = std::vector<CUSTOMREAL>(10, 0.0);
+
+    //
+    // initialize
+    //
+    if (i_inv == 0) {
+        // initialize regularization
+        // regularization_penalty[:] = 1.0
+        init_regulaization_penalty_with_one(grid);
+
+        // calculate volume_domain = SquaredL2Norm(regularization_penalty)
+        volume_domain = compute_volume_domain(grid);
+
+        // volume_domain /= N_params
+        volume_domain /= N_params;
+
+        // weight_Tikonov = penalty_weight / volume_domain
+        weight_Tikonov = regularization_weight / volume_domain;
+        weight_Tikonov_ani = regularization_weight;
+        std::cout << "DEBUG: weight_Tikonov: " << weight_Tikonov << std::endl;
+
+        // calculate damp_weights
+        compute_damp_weights(grid);
+
+        // calculate gradient (run onestep forward+adjoint) not necessary to do here. already done
+        //v_obj_misfit_new = run_simulation_one_step(IP, grid, io, i_inv, first_src, true);
+
+        // Q0 = initial obj
+        q_0 = v_obj_inout;
+
+        // sum kernels among all simultaneous runs
+        sumup_kernels(grid);
+
+        // smooth gradient
+        smooth_kernels(grid, IP);
+
+        // calc regularization penalties
+        calculate_regularization_penalty(grid);
+
+        //// smooth calculated grad regularization penalty
+        //smooth_gradient_regularization(grid);
+
+        // calc (update) obj_regl = squaredL2Norm(regularization_penalty)
+        v_obj_reg = calculate_regularization_obj(grid);
+
+        // Q0 += 0.5*weight_Tiknov*obj_regl
+        q_0 += v_obj_reg;
+
+        // init_grad += weight_Tikonov * gadient_regularization_penalty
+        add_regularization_grad(grid);
+
+        // store model and gradient
+        store_model_and_gradient(grid, i_inv);
+
+        // log out
+        if(myrank == 0 && id_sim ==0) {
+            out_main \
+                     << std::setw(5)  << i_inv \
+              << "," << std::setw(5)  << subiter_count \
+              << "," << std::setw(15) << step_length \
+              << "," << std::setw(15) << q_0 \
+              << "," << std::setw(15) << q_t \
+              << "," << std::setw(15) << v_obj_reg \
+              << "," << std::setw(15) << qp_0 \
+              << "," << std::setw(15) << qp_t \
+              << "," << std::setw(15) << td \
+              << "," << std::setw(15) << tg \
+              << "," << std::setw(15) << wolfe_c1*qp_0 \
+              << "," << std::setw(15) << wolfe_c2*qp_0 \
+              << "," << std::setw(15) << "init" << std::endl;
+        }
+    }
+
+    // backup the initial model
+    grid.back_up_fun_xi_eta_bcf();
+
+    // update the model with the initial step size
+    if (IP.get_verbose_output_level() && id_sim==0) {
+        // store kernel only in the first src datafile
+        io.change_group_name_for_model();
+
+        // output updated velocity models
+        io.write_Ks(grid, i_inv);
+        io.write_Keta(grid, i_inv);
+        io.write_Kxi(grid, i_inv);
+
+        // output descent direction
+        io.write_Ks_descent_dir(grid, i_inv);
+        io.write_Keta_descent_dir(grid, i_inv);
+        io.write_Kxi_descent_dir(grid, i_inv);
+    }
+
+    // writeout temporary xdmf file
+    if (IP.get_verbose_output_level()&& id_sim==0)
+        io.update_xdmf_file();
+
+    //
+    // iteration
+    //
+    if (i_inv > 0) {
+        // sum kernels among all simultaneous runs
+        sumup_kernels(grid);
+
+        // smooth kernels
+        smooth_kernels(grid, IP);
+    }
+
+    // compute descent direction
+    calc_descent_direction(grid, i_inv, IP);
+
+    // smooth descent direction
+    smooth_descent_direction(grid);
+
+    // calc qp_0 = inital grad * descent direction
+    qp_0 = compute_q_k(grid);
+
+    // loop wolfes subiterations
+    bool wolfe_cond_ok = false;
+    while (wolfe_cond_ok != true) {
+
+        // initial guess for isub = 0
+        if (i_inv == 0 && subiter_count == 0)
+            initial_guess_step(grid, step_length, 1.0);
+        else if (i_inv == 1 && subiter_count == 0)
+            initial_guess_step(grid, step_length, LBFGS_RELATIVE_step_length);
+
+        //// Update model
+        set_new_model(grid, step_length);
+
+        //// calculate gradient (run onestep forward+adjoint)
+        v_obj_misfit_new = run_simulation_one_step(IP, grid, io, i_inv, first_src, true, false);
+
+        //// Qt update (=current obj)
+        q_t = v_obj_misfit_new[0];
+
+        // sum kernels among all simultaneous runs
+        sumup_kernels(grid);
+
+        //// smooth gradient
+        smooth_kernels(grid, IP);
+
+        //// calculate regularization to grad
+        calculate_regularization_penalty(grid);
+
+        //// smooth calculated grad regularization penalty
+        //smooth_gradient_regularization(grid);
+
+        //// add regularization term to Qt (Qt += 0.5 * obj_regl )
+        v_obj_reg = calculate_regularization_obj(grid);
+        q_t += v_obj_reg;
+
+        //// current grad += weight_Tikonov * gadient_regularization_penalty
+        add_regularization_grad(grid);
+
+        //// Qpt = Inner product(current_grad * descent_direction)
+        qp_t = compute_q_k(grid);
+
+        //// check wolfe conditions
+        wolfe_cond_ok = check_wolfe_cond(grid, q_0, q_t, qp_0, qp_t, td, tg, step_length);
+
+        // update the model with the initial step size
+        if (IP.get_verbose_output_level() && id_sim==0) {
+            // store kernel only in the first src datafile
+            io.change_group_name_for_model();
+
+            // gradient
+            io.write_Ks_update(grid, i_inv*100+subiter_count);
+            io.write_Keta_update(grid, i_inv*100+subiter_count);
+            io.write_Kxi_update(grid, i_inv*100+subiter_count);
+
+            // output descent direction
+            io.write_Ks_descent_dir(grid, i_inv*100+subiter_count);
+            io.write_Keta_descent_dir(grid, i_inv*100+subiter_count);
+            io.write_Kxi_descent_dir(grid, i_inv*100+subiter_count);
+
+        }
+
+        // writeout temporary xdmf file
+        if (IP.get_verbose_output_level()&& id_sim==0)
+            io.update_xdmf_file();
+
+        // log out
+        if(myrank == 0 && id_sim ==0)
+            out_main \
+                     << std::setw(5)  << i_inv \
+              << "," << std::setw(5)  << subiter_count \
+              << "," << std::setw(15) << step_length \
+              << "," << std::setw(15) << q_0 \
+              << "," << std::setw(15) << q_t \
+              << "," << std::setw(15) << v_obj_reg \
+              << "," << std::setw(15) << qp_0 \
+              << "," << std::setw(15) << qp_t \
+              << "," << std::setw(15) << td \
+              << "," << std::setw(15) << tg \
+              << "," << std::setw(15) << wolfe_c1*qp_0 \
+              << "," << std::setw(15) << wolfe_c2*qp_0 \
+              << "," << std::setw(15) << wolfe_cond_ok << std::endl;
+
+        if (wolfe_cond_ok) {
+            goto end_of_subiteration;
+        } else if (subiter_count > max_sub_iterations){
+            // reached max subiter
+            // exit
+            std::cout << "reached max subiterations" << std::endl;
+
+            return false;
+
+        } else {
+            // wolfe conditions not satisfied
+            subiter_count++;
+
+            grid.restore_fun_xi_eta_bcf();
+        }
+
+    }
+
+end_of_subiteration:
+    //// store model and gradient,
+    store_model_and_gradient(grid, i_inv+1);
+
+    //// Q0=Qt
+    q_0 = q_t;
+    qp_0 = qp_t;
+
+    step_length_init = step_length; // use current step size for the next iteration
+
+
+    if (myrank == 0)
+        std::cout << "Wolfe conditions satisfied at iteration " << subiter_count << std::endl;
+
+    return true;
+
+}
+
+
+
+#endif // MODEL_OPTIMIZATION_ROUTINES_H
--- a/include/model_update.h
+++ b/include/model_update.h
@@ -0,0 +1,485 @@
+#ifndef MODEL_UPDATE_H
+#define MODEL_UPDATE_H
+
+#include <cmath>
+#include "config.h"
+#include "grid.h"
+#include "smooth.h"
+#include "smooth_descent_dir.h"
+#include "smooth_grad_regul.h"
+#include "lbfgs.h"
+
+
+// generate smoothed kernels (K*_update_loc) from the kernels (K*_loc)
+// before doing this, K*_loc should be summed up among all simultaneous runs (by calling sumup_kernels)
+// before doing this, K*_update_loc has no meaning (unavailable)
+void smooth_kernels(Grid& grid, InputParams& IP) {
+
+    if (subdom_main){ // parallel level 3
+
+        if (id_sim==0){ // parallel level 1
+            // initiaize update params
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+                        grid.Ks_update_loc_previous[I2V(i,j,k)]   = grid.Ks_update_loc[I2V(i,j,k)];
+                        grid.Keta_update_loc_previous[I2V(i,j,k)] = grid.Keta_update_loc[I2V(i,j,k)];
+                        grid.Kxi_update_loc_previous[I2V(i,j,k)]  = grid.Kxi_update_loc[I2V(i,j,k)];
+                        grid.Ks_update_loc[I2V(i,j,k)]   = _0_CR;
+                        grid.Keta_update_loc[I2V(i,j,k)] = _0_CR;
+                        grid.Kxi_update_loc[I2V(i,j,k)]  = _0_CR;
+                        // grid.Ks_density_update_loc[I2V(i,j,k)]  = _0_CR;
+                        // grid.Kxi_density_update_loc[I2V(i,j,k)] = _0_CR;
+                        // grid.Keta_density_update_loc[I2V(i,j,k)] = _0_CR;
+                    }
+                }
+            }
+
+            // check kernel
+            CUSTOMREAL max_kernel = _0_CR;
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+                        max_kernel = std::max(max_kernel, std::abs(grid.Ks_loc[I2V(i,j,k)]));
+                    }
+                }
+            }
+            CUSTOMREAL tmp = 0;
+            allreduce_cr_single_max(max_kernel, tmp);
+            max_kernel = tmp;
+
+            if (max_kernel <= eps) {    
+                std::cout << "Error: max_kernel is near zero (less than 10^-12), check data residual and whether no data is used" << std::endl;
+                exit(1);
+            }
+
+            if (smooth_method == 0) {
+                // Ks_loc, Keta_loc, Kxi_loc to be 0 for ghost layers to eliminate the effect of overlapping boundary
+                for (int k = 0; k < loc_K; k++) {
+                    for (int j = 0; j < loc_J; j++) {
+                        for (int i = 0; i < loc_I; i++) {
+
+                            if (i == 0 && !grid.i_first()) {
+                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
+                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
+                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
+                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
+                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
+                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
+                            }
+                            if (i == loc_I-1 && !grid.i_last()) {
+                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
+                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
+                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
+                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
+                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
+                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
+                            }
+                            if (j == 0 && !grid.j_first()) {
+                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
+                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
+                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
+                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
+                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
+                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
+                            }
+                            if (j == loc_J-1 && !grid.j_last()) {
+                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
+                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
+                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
+                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
+                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
+                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
+                            }
+                            if (k == 0 && !grid.k_first()) {
+                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
+                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
+                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
+                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
+                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
+                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
+                            }
+                            if (k == loc_K-1 && !grid.k_last()) {
+                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
+                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
+                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
+                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
+                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
+                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
+                            }
+                        }
+                    }
+                }
+
+                // grid based smoothing
+                smooth_inv_kernels_orig(grid, IP);
+            } else if (smooth_method == 1) {
+                // CG smoothing
+                smooth_inv_kernels_CG(grid, smooth_lr, smooth_lt, smooth_lp);
+            }
+
+            // shared values on the boundary
+            grid.send_recev_boundary_data(grid.Ks_update_loc);
+            grid.send_recev_boundary_data(grid.Keta_update_loc);
+            grid.send_recev_boundary_data(grid.Kxi_update_loc);
+            grid.send_recev_boundary_data(grid.Ks_density_update_loc);
+            grid.send_recev_boundary_data(grid.Kxi_density_update_loc);
+            grid.send_recev_boundary_data(grid.Keta_density_update_loc);
+
+            grid.send_recev_boundary_data_kosumi(grid.Ks_update_loc);
+            grid.send_recev_boundary_data_kosumi(grid.Keta_update_loc);
+            grid.send_recev_boundary_data_kosumi(grid.Kxi_update_loc);
+            grid.send_recev_boundary_data_kosumi(grid.Ks_density_update_loc);
+            grid.send_recev_boundary_data_kosumi(grid.Kxi_density_update_loc);
+            grid.send_recev_boundary_data_kosumi(grid.Keta_density_update_loc);
+
+        } // end if id_sim == 0
+
+        // send the updated model to all the simultaneous run
+        broadcast_cr_inter_sim(grid.Ks_update_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Kxi_update_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Keta_update_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Ks_density_update_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Kxi_density_update_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Keta_density_update_loc, loc_I*loc_J*loc_K, 0);
+
+        // send the previous updated model to all the simultaneous run
+        broadcast_cr_inter_sim(grid.Ks_update_loc_previous, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Kxi_update_loc_previous, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Keta_update_loc_previous, loc_I*loc_J*loc_K, 0);
+
+    } // end if subdom_main
+
+    synchronize_all_world();
+}
+
+
+// smooth gradient regularization term
+void smooth_gradient_regularization(Grid& grid) {
+
+    if (subdom_main && id_sim==0){ // only id_sim==0 has values for these arrays
+        if (smooth_method == 0) {
+            // grid based smoothing
+            smooth_gradient_regularization_orig(grid);
+        } else if (smooth_method == 1) {
+            // CG smoothing
+            smooth_gradient_regularization_CG(grid, smooth_lr, smooth_lt, smooth_lp);
+        }
+
+        grid.send_recev_boundary_data(grid.fun_gradient_regularization_penalty_loc);
+        grid.send_recev_boundary_data(grid.eta_gradient_regularization_penalty_loc);
+        grid.send_recev_boundary_data(grid.xi_gradient_regularization_penalty_loc);
+    }
+}
+
+
+void smooth_descent_direction(Grid& grid){
+    if (subdom_main){
+       if (id_sim == 0) { // calculation of the update model is only done in the main simultaneous run
+            if (smooth_method == 0) {
+                // grid based smoothing
+                smooth_descent_dir(grid);
+            } else if (smooth_method == 1) {
+                // CG smoothing
+                smooth_descent_dir_CG(grid, smooth_lr, smooth_lt, smooth_lp);
+            }
+
+            // shared values on the boundary
+            grid.send_recev_boundary_data(  grid.Ks_descent_dir_loc);
+            grid.send_recev_boundary_data(grid.Keta_descent_dir_loc);
+            grid.send_recev_boundary_data( grid.Kxi_descent_dir_loc);
+
+        } // end if id_sim == 0
+
+        // send the updated model to all the simultaneous run
+        broadcast_cr_inter_sim(  grid.Ks_descent_dir_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim( grid.Kxi_descent_dir_loc, loc_I*loc_J*loc_K, 0);
+        broadcast_cr_inter_sim(grid.Keta_descent_dir_loc, loc_I*loc_J*loc_K, 0);
+
+    }
+}
+
+
+void calc_descent_direction(Grid& grid, int i_inv, InputParams& IP) {
+
+    if (subdom_main) {
+
+        // routines for LBFGS
+        if (optim_method == LBFGS_MODE) {
+
+            // calculate the descent direction
+            if (i_inv > 0) {
+                calculate_descent_direction_lbfgs(grid, i_inv);
+            // use gradient for the first iteration
+            } else {
+                int n_grid = loc_I*loc_J*loc_K;
+
+                // first time, descent direction = - precond * gradient
+                // inverse the gradient to fit the update scheme for LBFGS
+                for (int i = 0; i < n_grid; i++){
+                    grid.Ks_descent_dir_loc[i]   = - _1_CR* grid.Ks_update_loc[i];
+                    grid.Keta_descent_dir_loc[i] = - _1_CR* grid.Keta_update_loc[i];
+                    grid.Kxi_descent_dir_loc[i]  = - _1_CR* grid.Kxi_update_loc[i];
+                    //grid.Ks_descent_dir_loc[i]   = - _1_CR* grid.Ks_loc[i];
+                    //grid.Keta_descent_dir_loc[i] = - _1_CR* grid.Keta_loc[i];
+                    //grid.Kxi_descent_dir_loc[i]  = - _1_CR* grid.Kxi_loc[i];
+
+                }
+            }
+        } else {
+            // return error
+            std::cout << "Error: optim_method is not set to LBFGS_MODE (=2)" << std::endl;
+            exit(1);
+        }
+
+
+    } // end if subdom_main
+
+    synchronize_all_world();
+
+}
+
+CUSTOMREAL direction_change_of_model_update(Grid& grid){
+    CUSTOMREAL norm_grad = _0_CR;
+    CUSTOMREAL norm_grad_previous = _0_CR;
+    CUSTOMREAL inner_product = _0_CR;
+    CUSTOMREAL cos_angle = _0_CR;
+    CUSTOMREAL angle = _0_CR;
+    if (subdom_main) {
+        // initiaize update params
+        inner_product      = dot_product(grid.Ks_update_loc_previous, grid.Ks_update_loc, loc_I*loc_J*loc_K);
+        norm_grad          = dot_product(grid.Ks_update_loc, grid.Ks_update_loc, loc_I*loc_J*loc_K);
+        norm_grad_previous = dot_product(grid.Ks_update_loc_previous, grid.Ks_update_loc_previous, loc_I*loc_J*loc_K);
+
+        CUSTOMREAL tmp;
+        allreduce_cr_single(inner_product,tmp);
+        inner_product = tmp;
+
+        allreduce_cr_single(norm_grad,tmp);
+        norm_grad = tmp;
+
+        allreduce_cr_single(norm_grad_previous,tmp);
+        norm_grad_previous = tmp;
+
+        cos_angle = inner_product / (std::sqrt(norm_grad) * std::sqrt(norm_grad_previous));
+        angle     = acos(cos_angle) * RAD2DEG;
+    }
+
+    return angle;
+}
+
+
+void set_new_model(Grid& grid, CUSTOMREAL step_length_new, bool init_bfgs=false) {
+
+    if (subdom_main) {
+
+        // for LBFGS mode. K*_update_loc is not directly the descent direction but smoothed gradient
+        // so for non LBFGS_mode, nextstep will be calculated with K*_update_loc
+        if (optim_method != LBFGS_MODE) {
+            // // get the scaling factor
+            // CUSTOMREAL Linf_Ks = _0_CR, Linf_Keta = _0_CR, Linf_Kxi = _0_CR;
+            // for (int k = 1; k < loc_K-1; k++) {
+            //     for (int j = 1; j < loc_J-1; j++) {
+            //         for (int i = 1; i < loc_I-1; i++) {
+            //             Linf_Ks   = std::max(Linf_Ks,   std::abs(grid.Ks_update_loc[I2V(i,j,k)]));
+            //             Linf_Keta = std::max(Linf_Keta, std::abs(grid.Keta_update_loc[I2V(i,j,k)]));
+            //             Linf_Kxi  = std::max(Linf_Kxi,  std::abs(grid.Kxi_update_loc[I2V(i,j,k)]));
+            //         }
+            //     }
+            // }
+
+            // // get the maximum scaling factor among subdomains
+            // CUSTOMREAL Linf_tmp;
+            // allreduce_cr_single_max(Linf_Ks, Linf_tmp);   Linf_Ks = Linf_tmp;
+            // allreduce_cr_single_max(Linf_Keta, Linf_tmp); Linf_Keta = Linf_tmp;
+            // allreduce_cr_single_max(Linf_Kxi, Linf_tmp);  Linf_Kxi = Linf_tmp;
+
+            // CUSTOMREAL Linf_all = _0_CR;
+            // Linf_all = std::max(Linf_Ks, std::max(Linf_Keta, Linf_Kxi));
+
+            // // if (myrank == 0 && id_sim == 0)
+            // //    std::cout << "Scaling factor for all kernels: " << Linf_all << std::endl;
+            // //    std::cout << "Scaling factor for model update for Ks, Keta, Kx, stepsize: " << Linf_Ks << ", " << Linf_Keta << ", " << Linf_Kxi << ", " << step_length_new << std::endl;
+
+
+            // Linf_Ks = Linf_all;
+            // Linf_Keta = Linf_all;
+            // Linf_Kxi = Linf_all;
+
+            // kernel update has been rescaled in "smooth_inv_kernels_orig"
+
+            // update the model
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+
+                        // // update
+                        // grid.fun_loc[I2V(i,j,k)] *= (_1_CR - grid.Ks_update_loc[I2V(i,j,k)  ] / (Linf_Ks   /step_length_new) );
+                        // grid.xi_loc[I2V(i,j,k)]  -=          grid.Kxi_update_loc[I2V(i,j,k) ] / (Linf_Kxi  /step_length_new)  ;
+                        // grid.eta_loc[I2V(i,j,k)] -=          grid.Keta_update_loc[I2V(i,j,k)] / (Linf_Keta /step_length_new)  ;
+
+                        // update
+                        grid.fun_loc[I2V(i,j,k)] *= (_1_CR - grid.Ks_update_loc[I2V(i,j,k)  ] * step_length_new);
+                        grid.xi_loc[I2V(i,j,k)]  -=          grid.Kxi_update_loc[I2V(i,j,k) ] * step_length_new;
+                        grid.eta_loc[I2V(i,j,k)] -=          grid.Keta_update_loc[I2V(i,j,k)] * step_length_new;
+
+
+                        // grid.fac_b_loc[I2V(i,j,k)] = _1_CR - _2_CR * grid.xi_loc[I2V(i,j,k)];
+                        // grid.fac_c_loc[I2V(i,j,k)] = _1_CR + _2_CR * grid.xi_loc[I2V(i,j,k)];
+                        // grid.fac_f_loc[I2V(i,j,k)] =       - _2_CR * grid.eta_loc[I2V(i,j,k)];
+
+                        // grid.fun_loc[I2V(i,j,k)] *= (_1_CR);
+                        // grid.xi_loc[I2V(i,j,k)]  -= (_0_CR);
+                        // grid.eta_loc[I2V(i,j,k)] -= (_0_CR);
+
+                        // grid.fac_b_loc[I2V(i,j,k)] = _1_CR - _2_CR * grid.xi_loc[I2V(i,j,k)];
+                        // grid.fac_c_loc[I2V(i,j,k)] = _1_CR + _2_CR * grid.xi_loc[I2V(i,j,k)];
+                        // grid.fac_f_loc[I2V(i,j,k)] =       - _2_CR * grid.eta_loc[I2V(i,j,k)];
+
+                    }
+                }
+            }
+
+            grid.rejuvenate_abcf();
+
+
+        } else { // for LBFGS routine
+
+            // here all the simultaneous runs have the same values used in this routine.
+            // thus we don't need to if(id_sim==0)
+
+            CUSTOMREAL step_length = step_length_new;
+            const CUSTOMREAL factor = - _1_CR;
+
+            // update the model
+            for (int k = 0; k < loc_K; k++) {
+                for (int j = 0; j < loc_J; j++) {
+                    for (int i = 0; i < loc_I; i++) {
+                        // update
+                        grid.fun_loc[I2V(i,j,k)] *= (_1_CR - factor * grid.Ks_descent_dir_loc[I2V(i,j,k)]   * step_length);
+                        grid.xi_loc[I2V(i,j,k)]  -=          factor * grid.Kxi_descent_dir_loc[I2V(i,j,k) ] * step_length;
+                        grid.eta_loc[I2V(i,j,k)] -=          factor * grid.Keta_descent_dir_loc[I2V(i,j,k)] * step_length;
+                        //grid.fun_loc[I2V(i,j,k)] += grid.Ks_descent_dir_loc[I2V(i,j,k)]  *step_length_new;
+                        //grid.xi_loc[I2V(i,j,k)]  += grid.Kxi_descent_dir_loc[I2V(i,j,k)] *step_length_new;
+                        //grid.eta_loc[I2V(i,j,k)] += grid.Keta_descent_dir_loc[I2V(i,j,k)]*step_length_new;
+
+                        grid.fac_b_loc[I2V(i,j,k)] = _1_CR - _2_CR * grid.xi_loc[I2V(i,j,k)];
+                        grid.fac_c_loc[I2V(i,j,k)] = _1_CR + _2_CR * grid.xi_loc[I2V(i,j,k)];
+                        grid.fac_f_loc[I2V(i,j,k)] =       - _2_CR * grid.eta_loc[I2V(i,j,k)];
+                    }
+                }
+            }
+
+        } // end LBFGS model update
+
+        // shared values on the boundary
+        grid.send_recev_boundary_data(grid.fun_loc);
+        grid.send_recev_boundary_data(grid.xi_loc);
+        grid.send_recev_boundary_data(grid.eta_loc);
+        grid.send_recev_boundary_data(grid.fac_b_loc);
+        grid.send_recev_boundary_data(grid.fac_c_loc);
+        grid.send_recev_boundary_data(grid.fac_f_loc);
+
+    } // end if subdom_main
+}
+
+
+
+// compute p_k * grad(f_k)
+CUSTOMREAL compute_q_k(Grid& grid) {
+
+    CUSTOMREAL tmp_qk = _0_CR;
+
+    if (subdom_main && id_sim == 0) {
+
+        // grad * descent_direction
+        for (int k = 0; k < loc_K; k++) {
+            for (int j = 0; j < loc_J; j++) {
+                for (int i = 0; i < loc_I; i++) {
+                    // we can add here a precondition (here use *_update_loc for storing descent direction)
+                    tmp_qk += grid.Ks_update_loc[I2V(i,j,k)]   * grid.Ks_descent_dir_loc[I2V(i,j,k)]
+                            + grid.Keta_update_loc[I2V(i,j,k)] * grid.Keta_descent_dir_loc[I2V(i,j,k)]
+                            + grid.Kxi_update_loc[I2V(i,j,k)]  * grid.Kxi_descent_dir_loc[I2V(i,j,k)];
+                }
+            }
+        }
+
+        // add tmp_qk among all subdomain
+        allreduce_cr_single(tmp_qk, tmp_qk);
+    }
+
+    // share tmp_qk among all simultaneous runs
+    if (subdom_main)
+        broadcast_cr_single_inter_sim(tmp_qk,0);
+
+    return tmp_qk;
+
+}
+
+
+// check if the wolfe conditions are satisfied
+bool check_wolfe_cond(Grid& grid, \
+                    CUSTOMREAL q_0, CUSTOMREAL q_t, \
+                    CUSTOMREAL qp_0, CUSTOMREAL qp_t, \
+                    CUSTOMREAL& td, CUSTOMREAL& tg, CUSTOMREAL& step_length_sub) {
+    /*
+    q_0 : initial cost function
+    q_t : current cost function
+    q_p0 : initial grad * descent_dir
+    q_pt : current grad * descent_dir
+    td : right step size
+    tg : left step size
+    step_length_sub : current step size
+    */
+
+    bool step_accepted = false;
+
+    // check  if the wolfe conditions are satisfied and update the step_length_sub
+    CUSTOMREAL qpt = (q_t - q_0) / step_length_sub;
+
+    if (subdom_main) {
+        // good step size
+        if (qpt   <= wolfe_c1*qp_0 \
+         && wolfe_c2*qp_0 <= qp_t \
+         ){
+            if (myrank==0)
+                std::cout << "Wolfe rules:  step accepted" << std::endl;
+            step_accepted = true;
+        } else {
+            // modify the stepsize
+            if (wolfe_c1*qp_0 < qpt) {
+                td = step_length_sub;
+                if (myrank==0)
+                    std::cout << "Wolfe rules:  right step size updated." << std::endl;
+            }
+            if (qpt <= wolfe_c1*qp_0 && qp_t < wolfe_c2*qp_0) {
+                tg = step_length_sub;
+                if (myrank==0)
+                    std::cout << "Wolfe rules:  left step size updated." << std::endl;
+            }
+            //if (isZero(td)) {
+            if (td == _0_CR) {
+                step_length_sub = _2_CR * step_length_sub;
+                if (myrank==0)
+                    std::cout << "Wolfe rules:  step size too small. Increased to " << step_length_sub << std::endl;
+            } else {
+                step_length_sub = _0_5_CR * (td+tg);
+                if (myrank==0)
+                    std::cout << "Wolfe rules:  step size too large. Decreased to " << step_length_sub << std::endl;
+            }
+        }
+
+        // share step_accepted among all subdomains
+        broadcast_bool_single(step_accepted,0);
+        broadcast_cr_single(step_length_sub,0);
+    } // end if subdom_main
+
+    broadcast_bool_single_sub(step_accepted,0);
+    broadcast_cr_single_sub(step_length_sub,0);
+
+    return step_accepted;
+}
+
+
+
+#endif // MODEL_UPDATE_H
--- a/include/mpi_funcs.h
+++ b/include/mpi_funcs.h
--- a/include/objective_function_utils.h
+++ b/include/objective_function_utils.h
@@ -0,0 +1,181 @@
+#ifndef OBJECTIVE_FUNCTION_UTILS_H
+#define OBJECTIVE_FUNCTION_UTILS_H
+
+#include <iostream>
+#include <vector>
+
+#include "config.h"
+#include "input_params.h"
+
+// prepare header line of objective_funciton.txt
+inline void prepare_header_line(InputParams &IP, std::ofstream &out_main) {
+    // prepare output for iteration status
+    if(myrank == 0 && id_sim ==0){
+        out_main.open(output_dir + "/objective_function.txt");
+        if (optim_method == GRADIENT_DESCENT || optim_method == HALVE_STEPPING_MODE){
+
+            out_main << std::setw(8) << std::right << "# iter,";
+            out_main << std::setw(13) << std::right << " type,";
+
+            // if (optim_method == HALVE_STEPPING_MODE)
+            //     out_main << std::setw(8) << std::right << "subiter,";        (TODO in the future)
+            std::string tmp = "obj(";
+            if(IP.get_run_mode() == ONED_INVERSION){
+                tmp.append(std::to_string(IP.N_abs_local_data));  // for 1d inversion, only absolute time residual is considered in obj
+            } else {
+                tmp.append(std::to_string(IP.N_data));
+            }
+            tmp.append("),");
+            out_main << std::setw(20) << tmp;
+
+            tmp = "obj_abs(";
+            tmp.append(std::to_string(IP.N_abs_local_data));
+            tmp.append("),");
+            out_main << std::setw(20) << tmp;
+
+            tmp = "obj_cs_dif(";
+            if (IP.get_is_srcrec_swap())
+                tmp.append(std::to_string(IP.N_cr_dif_local_data));
+            else
+                tmp.append(std::to_string(IP.N_cs_dif_local_data));
+            tmp.append("),");
+            out_main << std::setw(20) << tmp;
+
+            tmp = "obj_cr_dif(";
+            if (IP.get_is_srcrec_swap())
+                tmp.append(std::to_string(IP.N_cs_dif_local_data));
+            else
+                tmp.append(std::to_string(IP.N_cr_dif_local_data));
+            tmp.append("),");
+            out_main << std::setw(20) << tmp;
+
+            tmp = "obj_tele(";
+            tmp.append(std::to_string(IP.N_teleseismic_data));
+            tmp.append("),");
+            out_main << std::setw(20) << tmp;
+
+            out_main << std::setw(25) << "res(mean/std),";
+
+            out_main << std::setw(25) << "res_abs(mean/std),";
+
+            out_main << std::setw(25) << "res_cs_dif(mean/std),";
+
+            out_main << std::setw(25) << "res_cr_dif(mean/std),";
+
+            out_main << std::setw(25) << "res_tele(mean/std),";
+
+            out_main << std::setw(20) << "step_length," << std::endl;
+
+        } else if (optim_method == LBFGS_MODE)
+            out_main << std::setw(6)  << "it,"        << std::setw(6)  << "subit,"  << std::setw(16) << "step_length," << std::setw(16) << "q_0," << std::setw(16) << "q_t," \
+                     << std::setw(16) << "v_obj_reg," << std::setw(16) << "qp_0,"  << std::setw(16) << "qp_t,"       << std::setw(16) << "td,"  << std::setw(16) << "tg," \
+                     << std::setw(16) << "c1*qp_0,"    << std::setw(16) << "c2*qp_0," << std::setw(6)  << "step ok,"    << std::endl;
+
+    }
+}
+
+
+inline void write_objective_function(InputParams& IP, int i_inv, std::vector<CUSTOMREAL>& v_misfit_inout, std::ofstream& out_main, std::string type) {
+    // output objective function
+    if (myrank==0 && id_sim==0) {
+        out_main << std::setw(8) << std::to_string(i_inv)+",";
+        out_main << std::setw(13) << type;
+        out_main << "," << std::setw(19) << v_misfit_inout[0];
+        out_main << "," << std::setw(19) << v_misfit_inout[1];
+        if (IP.get_is_srcrec_swap())
+            out_main << "," << std::setw(19) << v_misfit_inout[3];
+        else
+            out_main << "," << std::setw(19) << v_misfit_inout[2];
+        if (IP.get_is_srcrec_swap())
+            out_main << "," << std::setw(19) << v_misfit_inout[2];
+        else
+            out_main << "," << std::setw(19) << v_misfit_inout[3];
+        out_main << "," << std::setw(19) << v_misfit_inout[4];
+        // res
+        CUSTOMREAL mean;
+        CUSTOMREAL std;
+        std::string tmp;
+        if (IP.N_data > 0) {
+            mean = v_misfit_inout[5]/IP.N_data;
+            std  = sqrt(v_misfit_inout[6]/IP.N_data - my_square(mean));
+            tmp = std::to_string(mean);
+            tmp.append("/");
+            tmp.append(std::to_string(std));
+            out_main << "," << std::setw(24) << tmp;
+        } else {
+            out_main << "," << std::setw(24) << "0.0/0.0";
+        }
+        // res_abs
+        if (IP.N_abs_local_data > 0) {
+            mean = v_misfit_inout[7]/IP.N_abs_local_data;
+            std  = sqrt(v_misfit_inout[8]/IP.N_abs_local_data - my_square(mean));
+            tmp = std::to_string(mean);
+            tmp.append("/");
+            tmp.append(std::to_string(std));
+            out_main << "," << std::setw(24) << tmp;
+        } else {
+            out_main << "," << std::setw(24) << "0.0/0.0";
+        }
+        if (IP.get_is_srcrec_swap()){
+            if (IP.N_cr_dif_local_data > 0) {
+                mean = v_misfit_inout[11]/IP.N_cr_dif_local_data;
+                std  = sqrt(v_misfit_inout[12]/IP.N_cr_dif_local_data - my_square(mean));
+                tmp = std::to_string(mean);
+                tmp.append("/");
+                tmp.append(std::to_string(std));
+                out_main << "," << std::setw(24) << tmp;
+            } else {
+                out_main << "," << std::setw(24) << "0.0/0.0";
+            }
+            if (IP.N_cs_dif_local_data > 0) {
+                mean = v_misfit_inout[9]/IP.N_cs_dif_local_data;
+                std  = sqrt(v_misfit_inout[10]/IP.N_cs_dif_local_data - my_square(mean));
+                tmp = std::to_string(mean);
+                tmp.append("/");
+                tmp.append(std::to_string(std));
+                out_main << "," << std::setw(24) << tmp;
+            } else {
+                out_main << "," << std::setw(24) << "0.0/0.0";
+            }
+        } else {
+            if (IP.N_cs_dif_local_data > 0) {
+                mean = v_misfit_inout[9]/IP.N_cs_dif_local_data;
+                std  = sqrt(v_misfit_inout[10]/IP.N_cs_dif_local_data - my_square(mean));
+                tmp = std::to_string(mean);
+                tmp.append("/");
+                tmp.append(std::to_string(std));
+                out_main << "," << std::setw(24) << tmp;
+            } else {
+                out_main << "," << std::setw(24) << "0.0/0.0";
+            }
+            if (IP.N_cr_dif_local_data > 0) {
+                mean = v_misfit_inout[11]/IP.N_cr_dif_local_data;
+                std  = sqrt(v_misfit_inout[12]/IP.N_cr_dif_local_data - my_square(mean));
+                tmp = std::to_string(mean);
+                tmp.append("/");
+                tmp.append(std::to_string(std));
+                out_main << "," << std::setw(24) << tmp;
+            } else {
+                out_main << "," << std::setw(24) << "0.0/0.0";
+            }
+        }
+
+        if (IP.N_teleseismic_data > 0) {
+            mean = v_misfit_inout[13]/IP.N_teleseismic_data;
+            std  = sqrt(v_misfit_inout[14]/IP.N_teleseismic_data - my_square(mean));
+            tmp = std::to_string(mean);
+            tmp.append("/");
+            tmp.append(std::to_string(std));
+            out_main << "," << std::setw(24) << tmp;
+        } else {
+            out_main << "," << std::setw(24) << "0.0/0.0";
+        }
+        if(type == "model update" || type == "1d inversion")
+            out_main << "," << std::setw(19) << step_length_init << ",";
+
+        out_main << std::endl;
+    }
+}
+
+
+#endif // OBJECTIVE_FUNCTION_UTILS_H
--- a/include/oneD_inversion.h
+++ b/include/oneD_inversion.h
@@ -0,0 +1,109 @@
+#ifndef ONED_INVERSION_H
+#define ONED_INVERSION_H
+
+#include <fstream>
+#include <string>
+#include <iostream>
+
+#include "input_params.h"
+#include "grid.h"
+#include "utils.h"
+#include "config.h"
+#include "receiver.h"
+
+class OneDInversion {
+
+public:
+    OneDInversion(InputParams&, Grid&);
+    ~OneDInversion();
+
+    // grid parameters
+    int             nr_1dinv, nt_1dinv;    // number of grid points in r and epicenter distance
+    CUSTOMREAL      *r_1dinv, *t_1dinv;    // coordinates
+    CUSTOMREAL      dr_1dinv, dt_1dinv;    // grid spacing
+
+    // parameters on grid nodes (for model)
+    CUSTOMREAL      *slowness_1dinv;       // slowness
+    CUSTOMREAL      *fac_a_1dinv;          // fac_a
+    CUSTOMREAL      *fac_b_1dinv;          // fac_b
+
+    // parameters on grid nodes (for eikonal solver and adjoint solver)
+    CUSTOMREAL      *tau_1dinv;            // tau
+    CUSTOMREAL      *tau_old_1dinv;        // tau_old
+    CUSTOMREAL      *T_1dinv;              // T
+    CUSTOMREAL      *Tadj_1dinv;           // Tadj
+    CUSTOMREAL      *Tadj_density_1dinv;           // Tadj
+    CUSTOMREAL      *T0v_1dinv;            // T0v
+    CUSTOMREAL      *T0r_1dinv;            // T0r
+    CUSTOMREAL      *T0t_1dinv;            // T0t
+    bool            *is_changed_1dinv;     // is_changed
+    CUSTOMREAL      *delta_1dinv;          // delta
+
+    // parameters on grid nodes (for inversion)
+    CUSTOMREAL      *Ks_1dinv;                      // Ks
+    CUSTOMREAL      *Ks_density_1dinv;              // Ks_density
+    CUSTOMREAL      *Ks_over_Kden_1dinv;            // Ks_over_Kdensity
+    CUSTOMREAL      *Ks_multigrid_1dinv;            // (Ks_over_Kdensity) parameterized by multigrid
+    CUSTOMREAL      *Ks_multigrid_previous_1dinv;   // Ks_multigrid at previous iteration 
+
+    // parameters for optimization
+    CUSTOMREAL      v_obj;
+    CUSTOMREAL      old_v_obj;
+
+    // functions
+    std::vector<CUSTOMREAL> run_simulation_one_step_1dinv(InputParams&, IO_utils&, const int&);
+    void model_optimize_1dinv(InputParams&, Grid&, IO_utils&, const int&); 
+
+    // h5 file output
+    void write_T_1dinv(IO_utils&, const std::string&, const int&);
+    void write_Tadj_1dinv(IO_utils&, const std::string&, const int&);
+    void write_vel_1dinv(IO_utils&, const int&);
+
+private:
+
+    // for eikonal solver
+    int count_cand;
+    int ii,ii_nr,ii_n2r,ii_pr,ii_p2r,ii_nt,ii_n2t,ii_pt,ii_p2t;
+    CUSTOMREAL ar,br,at,bt,ar1,ar2,at1,at2,br1,br2,bt1,bt2;
+    CUSTOMREAL eqn_a, eqn_b, eqn_c, eqn_Delta;
+    CUSTOMREAL tmp_tau, T_r, T_t, charact_r, charact_t;
+    bool is_causality;
+    std::vector<CUSTOMREAL> canditate = std::vector<CUSTOMREAL>(60);
+
+
+    // member functions
+    // class functions
+    void generate_2d_mesh(InputParams&);
+    void deallocate_arrays();
+    void allocate_arrays();
+    void load_1d_model(Grid&);
+    int I2V_1DINV(const int&,const int&);
+    
+    // "run_simulation_one_step_1dinv" subfunctions:
+    void eikonal_solver_2d(InputParams&, int& );
+    void initialize_eikonal_array(CUSTOMREAL);
+    void FSM_2d();
+    void calculate_stencil(const int&, const int&);
+    
+    void calculate_synthetic_traveltime_and_adjoint_source(InputParams&, int& );
+    CUSTOMREAL interpolate_2d_traveltime(const CUSTOMREAL&, const CUSTOMREAL&);
+
+    void adjoint_solver_2d(InputParams&, const int&, const int&);
+    void initialize_adjoint_array(InputParams&, const int&, const int&);
+    void FSM_2d_adjoint(const int&);
+    void calculate_stencil_adj(const int&, const int&);
+
+    void initialize_kernel_1d();
+    void calculate_kernel_1d();
+
+    // "model_optimize_1dinv" subfunctions:
+    void kernel_processing_1dinv(Grid&);
+    void density_normalization_1dinv();
+    void multi_grid_parameterization_1dinv(Grid&);
+    void model_update_1dinv(const int&);
+    void determine_step_size_1dinv(const int&);
+    CUSTOMREAL norm_1dinv(const CUSTOMREAL*, const int&);
+    void generate_3d_model(Grid&);
+};
+
+#endif // ONED_INVERSION_H
--- a/include/receiver.h
+++ b/include/receiver.h
@@ -0,0 +1,36 @@
+#ifndef RECEIVER_H
+#define RECEIVER_H
+
+#include <vector>
+#include "input_params.h"
+#include "grid.h"
+
+class Receiver {
+public:
+    Receiver();
+    ~Receiver();
+
+    void interpolate_and_store_arrival_times_at_rec_position(InputParams&, Grid&, const std::string&);       // only for common receiver differential traveltime
+
+    // adjoint source
+    void calculate_adjoint_source(InputParams&, const std::string&);
+    // objective function and residual
+    std::vector<CUSTOMREAL> calculate_obj_and_residual(InputParams&);
+    // Gradient of traveltime
+    void calculate_T_gradient(InputParams&, Grid&, const std::string&);
+    // initialize variables for source relocation
+    void init_vars_src_reloc(InputParams&);
+    // Gradient of objective function
+    void calculate_grad_obj_src_reloc(InputParams&, const std::string&);
+    // objective function
+    std::vector<CUSTOMREAL> calculate_obj_reloc(InputParams&, int);
+    // update source location
+    void update_source_location(InputParams&, Grid&);
+
+private:
+    CUSTOMREAL interpolate_travel_time(Grid&, InputParams&, std::string, std::string);
+    void calculate_T_gradient_one_rec(Grid&, InputParams&, std::string, CUSTOMREAL*);
+    bool check_if_receiver_is_in_this_subdomain(Grid&, const CUSTOMREAL&, const CUSTOMREAL&, const CUSTOMREAL&);
+};
+
+#endif // RECEIVER_H
--- a/include/simd_conf.h
+++ b/include/simd_conf.h
@@ -0,0 +1,194 @@
+#ifndef SIMD_CONF_H
+#define SIMD_CONF_H
+
+#ifdef USE_SIMD
+
+// flags for self controling the SIMD usage
+#define USE_AVX     __AVX__
+#define USE_AVX512  __AVX512F__
+#define USE_ARM_SVE __ARM_FEATURE_SVE
+#define USE_NEON __ARM_NEON__
+
+// forcely make USE_AVX and USE_AVX512 to be false if USE_ARM_SVE is true
+#if USE_ARM_SVE
+#undef USE_AVX
+#define USE_AVX     0
+#undef USE_AVX512
+#define USE_AVX512  0
+#endif
+
+#if USE_AVX || USE_AVX512
+#include <immintrin.h>
+#elif USE_ARM_SVE
+#include <arm_sve.h>
+#elif USE_NEON
+#include <arm_neon.h>
+#endif
+
+
+#ifdef SINGLE_PRECISION
+
+#if USE_AVX && !USE_AVX512
+const int ALIGN = 32;
+const int NSIMD = 8;
+#define __mT __m256
+#define _mmT_set1_pT _mm256_set1_ps
+#define _mmT_loadu_pT _mm256_loadu_ps
+#define _mmT_mul_pT _mm256_mul_ps
+#define _mmT_sub_pT _mm256_sub_ps
+#define _mmT_add_pT _mm256_add_ps
+#define _mmT_div_pT _mm256_div_ps
+#define _mmT_min_pT _mm256_min_ps
+#define _mmT_sqrt_pT _mm256_sqrt_ps
+#define _mmT_store_pT _mm256_store_ps
+#define _mmT_fmadd_pT _mm256_fmadd_ps
+
+#define _mm256_cmp_pT _mm256_cmp_ps
+
+#elif USE_AVX512
+const int ALIGN = 64;
+const int NSIMD = 16;
+#define __mT __m512
+#define _mmT_set1_pT _mm512_set1_ps
+#define _mmT_loadu_pT _mm512_loadu_ps
+#define _mmT_mul_pT _mm512_mul_ps
+#define _mmT_sub_pT _mm512_sub_ps
+#define _mmT_add_pT _mm512_add_ps
+#define _mmT_div_pT _mm512_div_ps
+#define _mmT_min_pT _mm512_min_ps
+#define _mmT_max_pT _mm512_max_ps
+#define _mmT_sqrt_pT _mm512_sqrt_ps
+#define _mmT_store_pT _mm512_store_ps
+#define _mmT_fmadd_pT _mm512_fmadd_ps
+
+#define __mmaskT __mmask16
+#define _mm512_cmp_pT_mask _mm512_cmp_ps_mask
+#define _mm512_mask_blend_pT _mm512_mask_blend_ps
+#define _kand_maskT _kand_mask16
+
+
+#elif USE_ARM_SVE // NOT TESTED YET
+const int ALIGN = 8*svcntd(); // use svcntw() for float
+const int NSIMD = svcntd(); // Vector Length
+#define __mT svfloat32_t
+//#define _mmT_set1_pT svdup_f64
+//#define _mmT_loadu_pT svld1_f64_f64_f64
+//#define _mmT_mul_pT svmul_f64_m
+//#define _mmT_sub_pT svsub_f64_m
+//#define _mmT_add_pT svadd_f64_m
+//#define _mmT_div_pT svdiv_f64_m
+//#define _mmT_min_pT svmin_f64_m
+//#define _mmT_sqrt_pT svsqrt_f64_m
+//#define _mmT_store_pT svst1_f64
+
+#elif USE_NEON
+const int ALIGN = 16;
+const int NSIMD = 4;
+#define __mT float32x4_t
+#define _mmT_set1_pT vdupq_n_f32
+#define _mmT_loadu_pT vld1q_f32
+#define _mmT_mul_pT vmulq_f32
+#define _mmT_sub_pT vsubq_f32
+#define _mmT_add_pT vaddq_f32
+#define _mmT_div_pT vdivq_f32
+#define _mmT_min_pT vminq_f32
+#define _mmT_sqrt_pT vsqrtq_f32
+#define _mmT_store_pT vst1q_f32
+
+#endif // __ARM_FEATURE_SVE
+
+
+#else // DOUBLE_PRECISION
+
+#if USE_AVX && !USE_AVX512
+const int ALIGN = 32;
+const int NSIMD = 4;
+#define __mT __m256d
+#define _mmT_set1_pT _mm256_set1_pd
+#define _mmT_loadu_pT _mm256_loadu_pd
+#define _mmT_mul_pT _mm256_mul_pd
+#define _mmT_sub_pT _mm256_sub_pd
+#define _mmT_add_pT _mm256_add_pd
+#define _mmT_div_pT _mm256_div_pd
+#define _mmT_min_pT _mm256_min_pd
+#define _mmT_sqrt_pT _mm256_sqrt_pd
+#define _mmT_store_pT _mm256_store_pd
+#define _mmT_fmadd_pT _mm256_fmadd_pd
+
+#define _mm256_cmp_pT _mm256_cmp_pd
+
+#elif USE_AVX512
+const int ALIGN = 64;
+const int NSIMD = 8;
+#define __mT __m512d
+#define _mmT_set1_pT _mm512_set1_pd
+#define _mmT_loadu_pT _mm512_loadu_pd
+#define _mmT_mul_pT _mm512_mul_pd
+#define _mmT_sub_pT _mm512_sub_pd
+#define _mmT_add_pT _mm512_add_pd
+#define _mmT_div_pT _mm512_div_pd
+#define _mmT_min_pT _mm512_min_pd
+#define _mmT_max_pT _mm512_max_pd
+#define _mmT_sqrt_pT _mm512_sqrt_pd
+#define _mmT_store_pT _mm512_store_pd
+#define _mmT_fmadd_pT _mm512_fmadd_pd
+
+#define __mmaskT __mmask8
+#define _mm512_cmp_pT_mask _mm512_cmp_pd_mask
+#define _mm512_mask_blend_pT _mm512_mask_blend_pd
+#define _kand_maskT _kand_mask8
+
+
+#elif USE_ARM_SVE
+const int ALIGN = 8*svcntd(); // use svcntw() for float
+const int NSIMD = svcntd(); // Vector Length
+#define __mT svfloat64_t
+//#define _mmT_set1_pT svdup_f64
+//#define _mmT_loadu_pT svld1_f64_f64_f64
+//#define _mmT_mul_pT svmul_f64_m
+//#define _mmT_sub_pT svsub_f64_m
+//#define _mmT_add_pT svadd_f64_m
+//#define _mmT_div_pT svdiv_f64_m
+//#define _mmT_min_pT svmin_f64_m
+//#define _mmT_sqrt_pT svsqrt_f64_m
+//#define _mmT_store_pT svst1_f64
+
+#elif USE_NEON
+const int ALIGN = 16;
+const int NSIMD = 2;
+#define __mT float64x2_t
+#define _mmT_set1_pT vdupq_n_f64
+#define _mmT_loadu_pT vld1q_f64
+#define _mmT_mul_pT vmulq_f64
+#define _mmT_sub_pT vsubq_f64
+#define _mmT_add_pT vaddq_f64
+#define _mmT_div_pT vdivq_f64
+#define _mmT_min_pT vminq_f64
+#define _mmT_sqrt_pT vsqrtq_f64
+#define _mmT_store_pT vst1q_f64
+
+#endif // __ARM_FEATURE_SVE
+
+#endif // DOUBLE_PRECISION
+
+inline void print_simd_type() {
+    std::cout << "SIMD type: ";
+#if USE_AVX && !USE_AVX512
+    std::cout << "AVX" << std::endl;
+#elif USE_AVX512
+    std::cout << "AVX512" << std::endl;
+#elif USE_ARM_SVE
+    std::cout << "ARM SVE" << std::endl;
+#elif USE_NEON
+    std::cout << "NEON" << std::endl;
+#endif // __ARM_FEATURE_SVE
+}
+
+#else // USE_CUDA but not AVX dummy
+const int ALIGN = 32;
+const int NSIMD = 4;
+#endif // USE_SIMD
+
+
+
+#endif // SIMD_CONF_H
--- a/include/smooth.h
+++ b/include/smooth.h
@@ -0,0 +1,773 @@
+#ifndef SMOOTH_H
+#define SMOOTH_H
+
+#include <iostream>
+#include "grid.h"
+#include "config.h"
+#include "utils.h"
+
+void calc_inversed_laplacian(CUSTOMREAL* d, CUSTOMREAL* Ap,
+                             const int i, const int j, const int k,
+                             const CUSTOMREAL lr, const CUSTOMREAL lt, const CUSTOMREAL lp,
+                             const CUSTOMREAL dr, const CUSTOMREAL dt, const CUSTOMREAL dp) {
+    // calculate inversed laplacian operator
+    CUSTOMREAL termx = _0_CR, termy = _0_CR, termz = _0_CR;
+
+    if (i==0) {
+        termx = dp*lp/3.0 * (1/(lp*dp)*(d[I2V(i,j,k)]) - lp/(dp*dp*dp)*(-2.0*d[I2V(i,j,k)]+2.0*d[I2V(i+1,j,k)]));
+    } else if (i==loc_I-1) {
+        termx = dp*lp/3.0 * (1/(lp*dp)*(d[I2V(i,j,k)]) - lp/(dp*dp*dp)*(-2.0*d[I2V(i,j,k)]+2.0*d[I2V(i-1,j,k)]));
+    } else {
+        termx = dp*lp/3.0 * (1/(lp*dp)*(d[I2V(i,j,k)]) - lp/(dp*dp*dp)*(-2.0*d[I2V(i,j,k)]+d[I2V(i-1,j,k)]+d[I2V(i+1,j,k)]));
+    }
+
+    if (j==0) {
+        termy = dt*lt/3.0 * (1/(lt*dt)*(d[I2V(i,j,k)]) - lt/(dt*dt*dt)*(-2.0*d[I2V(i,j,k)]+2.0*d[I2V(i,j+1,k)]));
+    } else if (j==loc_J-1) {
+        termy = dt*lt/3.0 * (1/(lt*dt)*(d[I2V(i,j,k)]) - lt/(dt*dt*dt)*(-2.0*d[I2V(i,j,k)]+2.0*d[I2V(i,j-1,k)]));
+    } else {
+        termy = dt*lt/3.0 * (1/(lt*dt)*(d[I2V(i,j,k)]) - lt/(dt*dt*dt)*(-2.0*d[I2V(i,j,k)]+d[I2V(i,j-1,k)]+d[I2V(i,j+1,k)]));
+    }
+
+    if (k==0) {
+        termz = dr*lr/3.0 * (1/(lr*dr)*(d[I2V(i,j,k)]) - lr/(dr*dr*dr)*(-2.0*d[I2V(i,j,k)]+2.0*d[I2V(i,j,k+1)]));
+    } else if (k==loc_K-1) {
+        termz = dr*lr/3.0 * (1/(lr*dr)*(d[I2V(i,j,k)]) - lr/(dr*dr*dr)*(-2.0*d[I2V(i,j,k)]+2.0*d[I2V(i,j,k-1)]));
+    } else {
+        termz = dr*lr/3.0 * (1/(lr*dr)*(d[I2V(i,j,k)]) - lr/(dr*dr*dr)*(-2.0*d[I2V(i,j,k)]+d[I2V(i,j,k-1)]+d[I2V(i,j,k+1)]));
+    }
+
+    Ap[I2V(i,j,k)] = termx+termy+termz;
+}
+
+
+
+void CG_smooth(Grid& grid, CUSTOMREAL* arr_in, CUSTOMREAL* arr_out, CUSTOMREAL lr, CUSTOMREAL lt, CUSTOMREAL lp) {
+    // arr: array to be smoothed
+    // lr: smooth length on r
+    // lt: smooth length on theta
+    // lp: smooth length on phi
+
+    // arrays :
+    // x_array  model
+    // g_array  gradient
+    // d_array  descent direction
+    // Ap  stiffness scalar (laplacian)
+    // pAp = d_array*Ap
+    // rr = g_array * g_array (dot product)
+
+    const int max_iter_cg = 1000;
+    const CUSTOMREAL xtol = 0.001;
+    const bool use_scaling = true;
+
+    CUSTOMREAL dr=grid.dr, dt=grid.dt, dp=grid.dp; // in km, rad, rad
+    //CUSTOMREAL dr=_1_CR,dt=_1_CR,dp=_1_CR;
+    //debug
+    std::cout << "dr, dt, dp, lr, lt, lp = " << dr << " " << dt << " " << dp << " " << lr << " " << lt << " " << lp << std::endl;
+
+    // allocate memory
+    CUSTOMREAL* x_array = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 3000);
+    CUSTOMREAL* r_array = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 3001);
+    CUSTOMREAL* p_array = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 3002);
+    CUSTOMREAL* Ap = allocateMemory<CUSTOMREAL>(loc_I*loc_J*loc_K, 3003);
+    CUSTOMREAL pAp=_0_CR, rr_0=_0_CR, rr=_0_CR, rr_new=_0_CR, aa=_0_CR, bb=_0_CR, tmp=_0_CR;
+
+    CUSTOMREAL scaling_A=_1_CR, scaling_coeff = _1_CR;
+
+    if (use_scaling) {
+        // calculate scaling factor
+        //scaling_A = std::sqrt(_1_CR / (_8_CR * PI * lr * lt * lp));
+        // scaling coefficient for gradient
+        scaling_coeff = find_absmax(arr_in, loc_I*loc_J*loc_K);
+        tmp = scaling_coeff;
+        allreduce_cr_single_max(tmp, scaling_coeff);
+        //if (scaling_coeff == _0_CR)
+        if (isZero(scaling_coeff))
+            scaling_coeff = _1_CR;
+    }
+    // std out scaling factors
+    if (myrank == 0) {
+        std::cout << "scaling_A = " << scaling_A << std::endl;
+        std::cout << "scaling_coeff = " << scaling_coeff << std::endl;
+    }
+
+
+    // array initialization
+    for (int i=0; i<loc_I*loc_J*loc_K; i++) {
+        x_array[i] = _0_CR;                   // x
+        r_array[i] = arr_in[i]/scaling_coeff; // r = b - Ax
+        p_array[i] = r_array[i];              // p
+        Ap[i] = _0_CR;
+    }
+
+    // initial rr
+    rr = dot_product(r_array, r_array, loc_I*loc_J*loc_K);
+    tmp = rr;
+    // sum all rr among all processors
+    allreduce_cr_single(tmp, rr);
+    rr_0 = rr; // record initial rr
+
+    int k_start=0, k_end=loc_K, j_start=0, j_end=loc_J, i_start=0, i_end=loc_I;
+
+    // CG loop
+    for (int iter=0; iter<max_iter_cg; iter++) {
+
+        // calculate laplacian
+        for (int k = k_start; k < k_end; k++) {
+            for (int j = j_start; j < j_end; j++) {
+                for (int i = i_start; i < i_end; i++) {
+                    //scaling_coeff = std::max(scaling_coeff, std::abs(arr[I2V(i,j,k)]));
+
+                    // calculate inversed laplacian operator
+                    calc_inversed_laplacian(p_array,Ap,i,j,k,lr,lt,lp,dr,dt,dp);
+
+                    // scaling
+                    Ap[I2V(i,j,k)] = Ap[I2V(i,j,k)]*scaling_A;
+                }
+            }
+        }
+
+        // get the values on the boundaries
+        grid.send_recev_boundary_data(Ap);
+
+        // calculate pAp
+        pAp = dot_product(p_array, Ap, loc_I*loc_J*loc_K);
+        tmp = pAp;
+        // sum all pAp among all processors
+        allreduce_cr_single(tmp, pAp);
+
+        // compute step length
+        aa = rr / pAp;
+
+        // update x_array (model)
+        for (int i=0; i<loc_I*loc_J*loc_K; i++) {
+            x_array[i] += aa * p_array[i];
+        }
+
+        // update r_array (gradient)
+        for (int i=0; i<loc_I*loc_J*loc_K; i++) {
+            r_array[i] -= aa * Ap[i];
+        }
+
+        // update rr
+        rr_new = dot_product(r_array, r_array, loc_I*loc_J*loc_K);
+        tmp = rr_new;
+        // sum all rr among all processors
+        allreduce_cr_single(tmp, rr_new);
+
+        // update d_array (descent direction)
+        bb = rr_new / rr;
+        for (int i=0; i<loc_I*loc_J*loc_K; i++) {
+            p_array[i] = r_array[i] + bb * p_array[i];
+        }
+
+        if (myrank == 0 && iter%100==0){//} && if_verbose) {
+            std::cout << "iter: " << iter << " rr: " << rr << " rr_new: " << rr_new << " rr/rr_0: " << rr/rr_0 << " pAp: " << pAp << " aa: " << aa << " bb: " << bb << std::endl;
+        }
+
+        // update rr
+        rr = rr_new;
+
+        // check convergence
+        if (rr / rr_0 < xtol) {
+            std::cout << "CG converged in " << iter << " iterations." << std::endl;
+            break;
+        }
+
+    } // end of CG loop
+
+
+    // copy x_array to arr_out
+    for (int i=0; i<loc_I*loc_J*loc_K; i++) {
+        arr_out[i] = x_array[i]*scaling_coeff;
+    }
+
+    // deallocate
+    delete[] x_array;
+    delete[] r_array;
+    delete[] p_array;
+    delete[] Ap;
+    //delete[] Ap_tmp;
+
+}
+
+
+
+
+void smooth_inv_kernels_CG(Grid& grid, CUSTOMREAL lr, CUSTOMREAL lt, CUSTOMREAL lp) {
+    // smoothing kernels with Conjugate Gradient method
+    // lr,lt,lp: smoothing length in r, theta, phi direction
+
+    // smooth Ks
+    CG_smooth(grid, grid.Ks_loc, grid.Ks_update_loc, lr, lt, lp);
+    // smooth Keta
+    CG_smooth(grid, grid.Keta_loc, grid.Keta_update_loc, lr, lt, lp);
+    // smooth Kxi
+    CG_smooth(grid, grid.Kxi_loc, grid.Kxi_update_loc, lr, lt, lp);
+
+}
+
+
+
+
+
+// original method for smoothing kernels
+inline void smooth_inv_kernels_orig(Grid& grid, InputParams& IP) {
+    // necessary params
+    CUSTOMREAL r_r, r_t, r_p;
+    CUSTOMREAL r_r_ani, r_t_ani, r_p_ani;
+
+    int kdr = 0, jdt = 0, idp = 0;
+    int kdr_ani = 0, jdt_ani = 0, idp_ani = 0;
+
+    int k_start = 0;
+    int j_start = 0;
+    int i_start = 0;
+    int k_end   = ngrid_k;
+    int j_end   = ngrid_j;
+    int i_end   = ngrid_i;
+
+    CUSTOMREAL weight   = _1_CR;
+    CUSTOMREAL * taper  = IP.get_depth_taper();
+
+    // check final kernel density
+    for (int i_loc = 0; i_loc < loc_I; i_loc++) {
+        for (int j_loc = 0; j_loc < loc_J; j_loc++) {
+            for (int k_loc = 0; k_loc < loc_K; k_loc++) {
+                if (isNegative(grid.Ks_density_loc[I2V(i_loc,j_loc,k_loc)])){
+                    std::cout   << "Warning: grid.Ks_density_loc[I2V(" << i_loc << "," << j_loc << "," << k_loc << ")] is less than 0, = " << grid.Ks_density_loc[I2V(i_loc,j_loc,k_loc)] << std::endl;
+                }
+                if (isNegative(grid.Kxi_density_loc[I2V(i_loc,j_loc,k_loc)])){
+                    std::cout << "Warning: grid.Kxi_density_loc[I2V(" << i_loc << "," << j_loc << "," << k_loc << ")] is less than 0, = " << grid.Kxi_density_loc[I2V(i_loc,j_loc,k_loc)] << std::endl;
+                }
+                if (isNegative(grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)])){
+                    std::cout << "Warning: grid.Keta_density_loc[I2V(" << i_loc << "," << j_loc << "," << k_loc << ")] is less than 0, = " << grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] << std::endl;
+                }
+            }
+        }
+    }
+
+    // remark: kernel density normalization is shifted from kernel to kernel_inv. 
+    // Because high kernel density may concentrated below station at single grid point. This part should be strenghthened.
+    // However, low kernel density may be distributed widely at middle depth. This part should be weakened.
+    // kernel_inv considers the integration, whereas kernel only considers one grid point.
+    // Therefore, kernel density normalization should be applied to kernel_inv, not kernel.
+
+    // // kernel density normalization
+    // for (int i_loc = 0; i_loc < loc_I; i_loc++) {
+    //     for (int j_loc = 0; j_loc < loc_J; j_loc++) {
+    //         for (int k_loc = 0; k_loc < loc_K; k_loc++) {
+    //             if(isZero(grid.Ks_density_loc[I2V(i_loc,j_loc,k_loc)])){
+    //                 // do nothing
+    //             } else {
+    //                 if (grid.Ks_density_loc[I2V(i_loc,j_loc,k_loc)] < 0){
+    //                     std::cout   << "Warning: grid.Ks_density_loc[I2V(" << i_loc << "," << j_loc << "," << k_loc << ")] is less than 0, = " 
+    //                                 << grid.Ks_density_loc[I2V(i_loc,j_loc,k_loc)]                      
+    //                                 << std::endl;
+    //                 }
+    //                 grid.Ks_loc[I2V(i_loc,j_loc,k_loc)] /= std::pow(std::abs(grid.Ks_density_loc[I2V(i_loc,j_loc,k_loc)]),Kdensity_coe);
+    //             }
+                
+    //             if(isZero(grid.Kxi_density_loc[I2V(i_loc,j_loc,k_loc)])){
+    //                 // do nothing
+    //             } else {
+    //                 if (grid.Kxi_density_loc[I2V(i_loc,j_loc,k_loc)] < 0){
+    //                     std::cout << "Warning: grid.Kxi_density_loc[I2V(" << i_loc << "," << j_loc << "," << k_loc << ")] is less than 0, = " << grid.Kxi_density_loc[I2V(i_loc,j_loc,k_loc)] << std::endl;
+    //                 }
+    //                 grid.Kxi_loc[ I2V(i_loc,j_loc,k_loc)] /= std::pow(std::abs(grid.Kxi_density_loc[I2V(i_loc,j_loc,k_loc)]),Kdensity_coe);
+    //             }
+
+    //             if(isZero(grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)])){
+    //                 // do nothing
+    //             } else {
+    //                 if (grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] < 0){
+    //                     std::cout << "Warning: grid.Keta_density_loc[I2V(" << i_loc << "," << j_loc << "," << k_loc << ")] is less than 0, = " << grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] << std::endl;
+    //                 }
+    //                 grid.Keta_loc[I2V(i_loc,j_loc,k_loc)] /= std::pow(std::abs(grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)]),Kdensity_coe);
+    //             }
+    //         }
+    //     }
+    // }
+
+    //
+    // sum the kernel values on the inversion grid
+    //
+
+    for (int i_grid = 0; i_grid < n_inv_grids; i_grid++) {
+
+        // init coarser kernel arrays
+        // velocity
+        for (int k = 0; k < n_inv_K_loc; k++) {
+            for (int j = 0; j < n_inv_J_loc; j++) {
+                for (int i = 0; i < n_inv_I_loc; i++) {
+                    grid.Ks_inv_loc[  I2V_INV_KNL(i,j,k)] = _0_CR;
+                    grid.Ks_density_inv_loc[I2V_INV_KNL(i,j,k)] = _0_CR;
+                }
+            }
+        }
+        // anisotropy
+        for (int k = 0; k < n_inv_K_loc_ani; k++) {
+            for (int j = 0; j < n_inv_J_loc_ani; j++) {
+                for (int i = 0; i < n_inv_I_loc_ani; i++) {
+                    grid.Keta_inv_loc[I2V_INV_ANI_KNL(i,j,k)] = _0_CR;
+                    grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)] = _0_CR;
+
+                    grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(i,j,k)] = _0_CR;
+                    grid.Kxi_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)] = _0_CR;
+                }
+            }
+        }
+
+
+        for (int k = k_start; k < k_end; k++) {
+            CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            r_r = -_1_CR;
+            for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+                // increasing or decreasing order
+                if (in_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)])) {
+                    kdr = ii_invr;
+                    r_r = calc_ratio_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]);
+                    break;
+                }
+            }
+            r_r_ani = -_1_CR;
+            for (int ii_invr = 0; ii_invr < n_inv_K_loc_ani-1; ii_invr++){
+                // increasing or decreasing order
+                if (in_between(r_glob, grid.inv_grid->r_ani.arr[I2V_INV_ANI_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r_ani.arr[I2V_INV_ANI_GRIDS_1DK(ii_invr+1,i_grid)])) {
+                    kdr_ani = ii_invr;
+                    r_r_ani = calc_ratio_between(r_glob, grid.inv_grid->r_ani.arr[I2V_INV_ANI_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r_ani.arr[I2V_INV_ANI_GRIDS_1DK(ii_invr+1,i_grid)]);
+                    break;
+                }
+            }
+
+            // continue if r is out of the inversion grid
+            if (r_r < _0_CR || r_r_ani < _0_CR) continue;
+
+            for (int j = j_start; j < j_end; j++) {
+                CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat(); // global coordinate of t (latitude)
+                r_t = -_1_CR;
+                for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                    CUSTOMREAL left  = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr+1,i_grid)]*(r_r);
+                    CUSTOMREAL right = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr+1,i_grid)]*(r_r);
+                    if (in_between(t_glob, left, right)) {
+                        jdt = ii_invt;
+                        r_t = calc_ratio_between(t_glob, left, right);
+                        break;
+                    }
+                }
+
+                r_t_ani = -_1_CR;
+                for (int ii_invt = 0; ii_invt < n_inv_J_loc_ani-1; ii_invt++){
+                    //if ((t_glob - grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt,i_grid)]) * (t_glob - grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt+1,i_grid)]) <= _0_CR) {
+                    CUSTOMREAL left  = grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt,  kdr_ani,i_grid)]*(1-r_r_ani) + grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt,  kdr_ani+1,i_grid)]*(r_r_ani);
+                    CUSTOMREAL right = grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt+1,kdr_ani,i_grid)]*(1-r_r_ani) + grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt+1,kdr_ani+1,i_grid)]*(r_r_ani);
+                    if (in_between(t_glob, left, right)) {
+                        jdt_ani = ii_invt;
+                        r_t_ani = calc_ratio_between(t_glob, left, right);
+                        break;
+                    }
+                }
+
+                // continue if t is out of the inversion grid
+                if (r_t < _0_CR || r_t_ani < _0_CR) continue;
+
+                for (int i = i_start; i < i_end; i++) {
+                    CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();    // global coordinate of p (longitude)
+                    r_p = -_1_CR;
+                    for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                        CUSTOMREAL left  = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr+1,i_grid)]*(r_r);
+                        CUSTOMREAL right = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr+1,i_grid)]*(r_r);
+                        if (in_between(p_glob, left, right)) {
+                            idp = ii_invp;
+                            r_p = calc_ratio_between(p_glob, left, right);
+                            break;
+                        }
+                    }
+
+                    r_p_ani = -_1_CR;
+                    for (int ii_invp = 0; ii_invp < n_inv_I_loc_ani-1; ii_invp++){
+                        CUSTOMREAL left  = grid.inv_grid->p_ani.arr[I2V_INV_ANI_GRIDS_1DI(ii_invp,  kdr_ani,i_grid)]*(1-r_r_ani) + grid.inv_grid->p_ani.arr[I2V_INV_ANI_GRIDS_1DI(ii_invp,  kdr_ani+1,i_grid)]*(r_r_ani);
+                        CUSTOMREAL right = grid.inv_grid->p_ani.arr[I2V_INV_ANI_GRIDS_1DI(ii_invp+1,kdr_ani,i_grid)]*(1-r_r_ani) + grid.inv_grid->p_ani.arr[I2V_INV_ANI_GRIDS_1DI(ii_invp+1,kdr_ani+1,i_grid)]*(r_r_ani);
+                        if (in_between(p_glob, left, right)) {
+                            idp_ani = ii_invp;
+                            r_p_ani = calc_ratio_between(p_glob, left, right);
+                            break;
+                        }
+                    }
+
+                    // continue if p is out of the inversion grid
+                    if (r_p < _0_CR || r_p_ani < _0_CR) continue;
+
+                    // global grid id to local id
+                    int k_loc = k - grid.get_offset_k();
+                    int j_loc = j - grid.get_offset_j();
+                    int i_loc = i - grid.get_offset_i();
+
+                    // check if *_loc are inside the local subdomain
+                    if (k_loc < 0 || k_loc > loc_K-1) continue;
+                    if (j_loc < 0 || j_loc > loc_J-1) continue;
+                    if (i_loc < 0 || i_loc > loc_I-1) continue;
+
+                    // check if Ks_density_loc is 0 (no contributary kernel here)
+                    // if(isZero(grid.Ks_density_loc[I2V(i_loc,j_loc,k_loc)])) continue;
+
+                    CUSTOMREAL dxdydz = (grid.dr) * (r_glob*grid.dt) * (r_glob*cos(t_glob)*grid.dp);    // dx*dy*dz  = r^2*cos(t)dr*dt*dp
+
+                    // update Ks_inv Keta_inv Kxi_inv
+                    // -p, -t, -r
+                    grid.Ks_inv_loc[  I2V_INV_KNL(    idp,    jdt,    kdr)]     += (_1_CR-r_r)    *(_1_CR-r_t)    *(_1_CR-r_p)    *grid.Ks_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani)] += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Keta_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani)] += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Kxi_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    grid.Ks_density_inv_loc[  I2V_INV_KNL(    idp,    jdt,    kdr)]     += (_1_CR-r_r)    *(_1_CR-r_t)    *(_1_CR-r_p)    *grid.Ks_density_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani)] += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani)] += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Kxi_density_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    // -p, -t, +r
+                    grid.Ks_inv_loc[  I2V_INV_KNL(    idp,    jdt,    kdr    +1)] += r_r    *(_1_CR-r_t)    *(_1_CR-r_p)    *grid.Ks_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani+1)] += r_r_ani*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Keta_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani+1)] += r_r_ani*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Kxi_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    grid.Ks_density_inv_loc[  I2V_INV_KNL(    idp,    jdt,    kdr    +1)] += r_r    *(_1_CR-r_t)    *(_1_CR-r_p)    *grid.Ks_density_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani+1)] += r_r_ani*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani+1)] += r_r_ani*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Kxi_density_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+
+                    // -p, +t, -r
+                    grid.Ks_inv_loc[  I2V_INV_KNL(    idp,    jdt    +1,kdr)]     += (_1_CR-r_r)    *r_t    *(_1_CR-r_p)    *grid.Ks_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani)] += (_1_CR-r_r_ani)*r_t_ani*(_1_CR-r_p_ani)*grid.Keta_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani)] += (_1_CR-r_r_ani)*r_t_ani*(_1_CR-r_p_ani)*grid.Kxi_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    grid.Ks_density_inv_loc[  I2V_INV_KNL(    idp,    jdt    +1,kdr)]     += (_1_CR-r_r)    *r_t    *(_1_CR-r_p)    *grid.Ks_density_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani)] += (_1_CR-r_r_ani)*r_t_ani*(_1_CR-r_p_ani)*grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani)] += (_1_CR-r_r_ani)*r_t_ani*(_1_CR-r_p_ani)*grid.Kxi_density_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    // +p, -t, -r
+                    grid.Ks_inv_loc[  I2V_INV_KNL(    idp    +1,jdt,    kdr)]     += (_1_CR-r_r)    *(_1_CR-r_t)    *r_p    *grid.Ks_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani)] += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*r_p_ani*grid.Keta_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani)] += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*r_p_ani*grid.Kxi_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    grid.Ks_density_inv_loc[  I2V_INV_KNL(    idp    +1,jdt,    kdr)]     += (_1_CR-r_r)    *(_1_CR-r_t)    *r_p    *grid.Ks_density_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani)] += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*r_p_ani*grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani)] += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*r_p_ani*grid.Kxi_density_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    // -p, +t, +r
+                    grid.Ks_inv_loc[  I2V_INV_KNL(    idp,    jdt    +1,kdr    +1)] += r_r    *r_t    *(_1_CR-r_p)    *grid.Ks_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani+1)] += r_r_ani*r_t_ani*(_1_CR-r_p_ani)*grid.Keta_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani+1)] += r_r_ani*r_t_ani*(_1_CR-r_p_ani)*grid.Kxi_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    grid.Ks_density_inv_loc[  I2V_INV_KNL(    idp,    jdt    +1,kdr    +1)] += r_r    *r_t    *(_1_CR-r_p)    *grid.Ks_density_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani+1)] += r_r_ani*r_t_ani*(_1_CR-r_p_ani)*grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani+1)] += r_r_ani*r_t_ani*(_1_CR-r_p_ani)*grid.Kxi_density_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    // +p, -t, +r
+                    grid.Ks_inv_loc[  I2V_INV_KNL(    idp    +1,jdt,    kdr    +1)] += r_r    *(_1_CR-r_t)    *r_p    *grid.Ks_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani+1)] += r_r_ani*(_1_CR-r_t_ani)*r_p_ani*grid.Keta_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani+1)] += r_r_ani*(_1_CR-r_t_ani)*r_p_ani*grid.Kxi_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    grid.Ks_density_inv_loc[  I2V_INV_KNL(    idp    +1,jdt,    kdr    +1)] += r_r    *(_1_CR-r_t)    *r_p    *grid.Ks_density_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani+1)] += r_r_ani*(_1_CR-r_t_ani)*r_p_ani*grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani+1)] += r_r_ani*(_1_CR-r_t_ani)*r_p_ani*grid.Kxi_density_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    // +p, +t, -r
+                    grid.Ks_inv_loc[  I2V_INV_KNL(    idp    +1,jdt    +1,kdr)]     += (_1_CR-r_r)    *r_t    *r_p    *grid.Ks_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani)] += (_1_CR-r_r_ani)*r_t_ani*r_p_ani*grid.Keta_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani)] += (_1_CR-r_r_ani)*r_t_ani*r_p_ani*grid.Kxi_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    grid.Ks_density_inv_loc[  I2V_INV_KNL(    idp    +1,jdt    +1,kdr)]     += (_1_CR-r_r)    *r_t    *r_p    *grid.Ks_density_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani)] += (_1_CR-r_r_ani)*r_t_ani*r_p_ani*grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani)] += (_1_CR-r_r_ani)*r_t_ani*r_p_ani*grid.Kxi_density_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    // +p, +t, +r
+                    grid.Ks_inv_loc[  I2V_INV_KNL(    idp    +1,jdt    +1,kdr    +1)] += r_r    *r_t    *r_p    *grid.Ks_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani+1)] += r_r_ani*r_t_ani*r_p_ani*grid.Keta_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani+1)] += r_r_ani*r_t_ani*r_p_ani*grid.Kxi_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                    grid.Ks_density_inv_loc[  I2V_INV_KNL(    idp    +1,jdt    +1,kdr    +1)] += r_r    *r_t    *r_p    *grid.Ks_density_loc[  I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani+1)] += r_r_ani*r_t_ani*r_p_ani*grid.Keta_density_loc[I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani+1)] += r_r_ani*r_t_ani*r_p_ani*grid.Kxi_density_loc[ I2V(i_loc,j_loc,k_loc)] * dxdydz;
+                    
+                } // end for i
+            } // end for j
+        } // end for k
+
+
+        // sum over all sub-domains
+        allreduce_cr_inplace(grid.Ks_inv_loc,       n_inv_I_loc*n_inv_J_loc*n_inv_K_loc);
+        allreduce_cr_inplace(grid.Keta_inv_loc,     n_inv_I_loc_ani*n_inv_J_loc_ani*n_inv_K_loc_ani);
+        allreduce_cr_inplace(grid.Kxi_inv_loc,      n_inv_I_loc_ani*n_inv_J_loc_ani*n_inv_K_loc_ani);
+
+        allreduce_cr_inplace(grid.Ks_density_inv_loc,       n_inv_I_loc*n_inv_J_loc*n_inv_K_loc);
+        allreduce_cr_inplace(grid.Keta_density_inv_loc,     n_inv_I_loc_ani*n_inv_J_loc_ani*n_inv_K_loc_ani);
+        allreduce_cr_inplace(grid.Kxi_density_inv_loc,      n_inv_I_loc_ani*n_inv_J_loc_ani*n_inv_K_loc_ani);
+
+        // kernel density normalization
+        // velocity
+        for (int k = 0; k < n_inv_K_loc; k++) {
+            for (int j = 0; j < n_inv_J_loc; j++) {
+                for (int i = 0; i < n_inv_I_loc; i++) {
+                    if(isZero(grid.Ks_density_inv_loc[I2V_INV_KNL(i,j,k)])){
+                        // do nothing
+                    } else {
+                        if (isNegative(grid.Ks_density_inv_loc[I2V_INV_KNL(i,j,k)])){
+                            std::cout   << "Warning: grid.Ks_density_inv_loc[I2V_INV_KNL(" << i << "," << j << "," << k << ")] is less than 0, = " 
+                                        << grid.Ks_density_inv_loc[I2V_INV_KNL(i,j,k)]   
+                                        << ", using absolute value instead."                   
+                                        << std::endl;
+                        }
+                        grid.Ks_inv_loc[I2V_INV_KNL(i,j,k)] /= std::pow(std::abs(grid.Ks_density_inv_loc[I2V_INV_KNL(i,j,k)]),Kdensity_coe);
+                    }
+                }
+            }
+        }
+        // anisotropy
+        for (int k = 0; k < n_inv_K_loc_ani; k++) {
+            for (int j = 0; j < n_inv_J_loc_ani; j++) {
+                for (int i = 0; i < n_inv_I_loc_ani; i++) {
+                    if(isZero(grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)])){
+                        // do nothing
+                    } else {
+                        if (isNegative(grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)])){
+                            std::cout   << "Warning: grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(" << i << "," << j << "," << k << ")] is less than 0, = " 
+                                        << grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)]   
+                                        << ", using absolute value instead."                   
+                                        << std::endl;
+                        }
+                        grid.Keta_inv_loc[I2V_INV_ANI_KNL(i,j,k)] /= std::pow(std::abs(grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)]),Kdensity_coe);
+                    }
+
+                    if(isZero(grid.Kxi_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)])){
+                        // do nothing
+                    } else {
+                        if (isNegative(grid.Kxi_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)])){
+                            std::cout   << "Warning: grid.Kxi_density_inv_loc[I2V_INV_ANI_KNL(" << i << "," << j << "," << k << ")] is less than 0, = " 
+                                        << grid.Kxi_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)]   
+                                        << ", using absolute value instead."                   
+                                        << std::endl;
+                        }
+                        grid.Kxi_inv_loc[I2V_INV_ANI_KNL(i,j,k)] /= std::pow(std::abs(grid.Kxi_density_inv_loc[I2V_INV_ANI_KNL(i,j,k)]),Kdensity_coe);
+                    }
+                }
+            }
+        }
+
+        //
+        // update factors
+        //
+        for (int k = k_start; k < k_end; k++) {
+            CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            r_r = -_1_CR;
+            for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+                if (in_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)])) {
+                    kdr = ii_invr;
+                    r_r = calc_ratio_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]);
+                    break;
+                }
+            }
+
+            r_r_ani = -_1_CR;
+            for (int ii_invr = 0; ii_invr < n_inv_K_loc_ani-1; ii_invr++){
+                // increasing or decreasing order
+                if (in_between(r_glob, grid.inv_grid->r_ani.arr[I2V_INV_ANI_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r_ani.arr[I2V_INV_ANI_GRIDS_1DK(ii_invr+1,i_grid)])) {
+                    kdr_ani = ii_invr;
+                    r_r_ani = calc_ratio_between(r_glob, grid.inv_grid->r_ani.arr[I2V_INV_ANI_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r_ani.arr[I2V_INV_ANI_GRIDS_1DK(ii_invr+1,i_grid)]);
+                    break;
+                }
+            }
+            // depth model update taper
+            CUSTOMREAL depth = radius2depth(r_glob);
+            if (depth < taper[0]) {     // weight = 0;
+                weight = _0_CR;
+            } else if (depth < taper[1]) {
+                weight = (_1_CR - std::cos(PI*(depth - taper[0])/(taper[1] - taper[0]))) / _2_CR;
+            } else {
+                weight = _1_CR;
+            }
+
+            // continue if r is out of the inversion grid
+            if (r_r < _0_CR || r_r_ani < _0_CR) continue;
+
+            for (int j = j_start; j < j_end; j++) {
+                CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat();
+                r_t = -_1_CR;
+                for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                    CUSTOMREAL left  = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr+1,i_grid)]*(r_r);
+                    CUSTOMREAL right = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr+1,i_grid)]*(r_r);
+                    if (in_between(t_glob, left, right)) {
+                        jdt = ii_invt;
+                        r_t = calc_ratio_between(t_glob, left, right);
+                        break;
+                    }
+                }
+
+                r_t_ani = -_1_CR;
+                for (int ii_invt = 0; ii_invt < n_inv_J_loc_ani-1; ii_invt++){
+                    CUSTOMREAL left  = grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt,  kdr_ani,i_grid)]*(1-r_r_ani) + grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt,  kdr_ani+1,i_grid)]*(r_r_ani);
+                    CUSTOMREAL right = grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt+1,kdr_ani,i_grid)]*(1-r_r_ani) + grid.inv_grid->t_ani.arr[I2V_INV_ANI_GRIDS_1DJ(ii_invt+1,kdr_ani+1,i_grid)]*(r_r_ani);
+                    if (in_between(t_glob, left, right)) {
+                        jdt_ani = ii_invt;
+                        r_t_ani = calc_ratio_between(t_glob, left, right);
+                        break;
+                    }
+                }
+
+                // continue if t is out of the inversion grid
+                if (r_t < _0_CR || r_t_ani < _0_CR) continue;
+
+                for (int i = i_start; i < i_end; i++) {
+                    CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();
+                    r_p = -_1_CR;
+                    for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                        CUSTOMREAL left  = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr+1,i_grid)]*(r_r);
+                        CUSTOMREAL right = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr+1,i_grid)]*(r_r);
+                        if (in_between(p_glob, left, right)) {
+                            idp = ii_invp;
+                            r_p = calc_ratio_between(p_glob, left, right);
+                            break;
+                        }
+                    }
+
+                    r_p_ani = -_1_CR;
+                    for (int ii_invp = 0; ii_invp < n_inv_I_loc_ani-1; ii_invp++){
+                        CUSTOMREAL left  = grid.inv_grid->p_ani.arr[I2V_INV_ANI_GRIDS_1DI(ii_invp,  kdr_ani,i_grid)]*(1-r_r_ani) + grid.inv_grid->p_ani.arr[I2V_INV_ANI_GRIDS_1DI(ii_invp,  kdr_ani+1,i_grid)]*(r_r_ani);
+                        CUSTOMREAL right = grid.inv_grid->p_ani.arr[I2V_INV_ANI_GRIDS_1DI(ii_invp+1,kdr_ani,i_grid)]*(1-r_r_ani) + grid.inv_grid->p_ani.arr[I2V_INV_ANI_GRIDS_1DI(ii_invp+1,kdr_ani+1,i_grid)]*(r_r_ani);
+                        if (in_between(p_glob, left, right)) {
+                            idp_ani = ii_invp;
+                            r_p_ani = calc_ratio_between(p_glob, left, right);
+                            break;
+                        }
+                    }
+
+                    // continue if p is out of the inversion grid
+                    if (r_p < _0_CR || r_p_ani < _0_CR) continue;
+
+                    // global grid id to local id
+                    int k_loc = k - grid.get_offset_k();
+                    int j_loc = j - grid.get_offset_j();
+                    int i_loc = i - grid.get_offset_i();
+
+                    // check if *_loc are inside the local subdomain
+                    if (k_loc < 0 || k_loc > loc_K-1) continue;
+                    if (j_loc < 0 || j_loc > loc_J-1) continue;
+                    if (i_loc < 0 || i_loc > loc_I-1) continue;
+
+                    CUSTOMREAL pert_Ks   = 0.0;
+                    CUSTOMREAL pert_Keta = 0.0;
+                    CUSTOMREAL pert_Kxi  = 0.0;
+                    CUSTOMREAL pert_Ks_density  = 0.0;
+                    CUSTOMREAL pert_Keta_density = 0.0;
+                    CUSTOMREAL pert_Kxi_density  = 0.0;
+
+                    pert_Ks             += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr)];
+                    pert_Keta           += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani)];
+                    pert_Kxi            += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani)];
+                    pert_Ks_density     += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_density_inv_loc[  I2V_INV_KNL(idp,jdt,kdr)];
+                    pert_Keta_density   += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani)];
+                    pert_Kxi_density    += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani)];
+
+                    pert_Ks             += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr+1)];
+                    pert_Keta           += r_r_ani*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani+1)];
+                    pert_Kxi            += r_r_ani*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani+1)];
+                    pert_Ks_density     += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_density_inv_loc[  I2V_INV_KNL(idp,jdt,kdr+1)];
+                    pert_Keta_density   += r_r_ani*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani+1)];
+                    pert_Kxi_density    += r_r_ani*(_1_CR-r_t_ani)*(_1_CR-r_p_ani)*grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani,kdr_ani+1)];
+                    
+                    pert_Ks             += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr)];
+                    pert_Keta           += (_1_CR-r_r_ani)*r_t_ani*(_1_CR-r_p_ani)*grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani)];
+                    pert_Kxi            += (_1_CR-r_r_ani)*r_t_ani*(_1_CR-r_p_ani)*grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani)];
+                    pert_Ks_density     += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Ks_density_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr)];
+                    pert_Keta_density   = (_1_CR-r_r_ani)*r_t_ani*(_1_CR-r_p_ani)*grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani)];
+                    pert_Kxi_density    += (_1_CR-r_r_ani)*r_t_ani*(_1_CR-r_p_ani)*grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani)];
+                    
+                    pert_Ks             += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr)];
+                    pert_Keta           += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*r_p_ani*grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani)];
+                    pert_Kxi            += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*r_p_ani*grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani)];
+                    pert_Ks_density     += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Ks_density_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr)];
+                    pert_Keta_density   += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*r_p_ani*grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani)];
+                    pert_Kxi_density    += (_1_CR-r_r_ani)*(_1_CR-r_t_ani)*r_p_ani*grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani)];
+                    
+                    pert_Ks             += r_r*r_t*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr+1)];
+                    pert_Keta           += r_r_ani*r_t_ani*(_1_CR-r_p_ani)*grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani+1)];
+                    pert_Kxi            += r_r_ani*r_t_ani*(_1_CR-r_p_ani)*grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani+1)];
+                    pert_Ks_density     += r_r*r_t*(_1_CR-r_p)*grid.Ks_density_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr+1)];
+                    pert_Keta_density   += r_r_ani*r_t_ani*(_1_CR-r_p_ani)*grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani+1)];
+                    pert_Kxi_density    += r_r_ani*r_t_ani*(_1_CR-r_p_ani)*grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani,jdt_ani+1,kdr_ani+1)];
+                    
+                    pert_Ks             += r_r*(_1_CR-r_t)*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr+1)];
+                    pert_Keta           += r_r_ani*(_1_CR-r_t_ani)*r_p_ani*grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani+1)];
+                    pert_Kxi            += r_r_ani*(_1_CR-r_t_ani)*r_p_ani*grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani+1)];
+                    pert_Ks_density     += r_r*(_1_CR-r_t)*r_p*grid.Ks_density_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr+1)];
+                    pert_Keta_density   += r_r_ani*(_1_CR-r_t_ani)*r_p_ani*grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani+1)];
+                    pert_Kxi_density    += r_r_ani*(_1_CR-r_t_ani)*r_p_ani*grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani,kdr_ani+1)];
+                    
+                    pert_Ks             += (_1_CR-r_r)*r_t*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr)];
+                    pert_Keta           += (_1_CR-r_r_ani)*r_t_ani*r_p_ani*grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani)];
+                    pert_Kxi            += (_1_CR-r_r_ani)*r_t_ani*r_p_ani*grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani)];
+                    pert_Ks_density     += (_1_CR-r_r)*r_t*r_p*grid.Ks_density_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr)];
+                    pert_Keta_density   += (_1_CR-r_r_ani)*r_t_ani*r_p_ani*grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani)];
+                    pert_Kxi_density    += (_1_CR-r_r_ani)*r_t_ani*r_p_ani*grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani)];
+
+                    pert_Ks             += r_r*r_t*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr+1)];
+                    pert_Keta           += r_r_ani*r_t_ani*r_p_ani*grid.Keta_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani+1)];
+                    pert_Kxi            += r_r_ani*r_t_ani*r_p_ani*grid.Kxi_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani+1)];
+                    pert_Ks_density     += r_r*r_t*r_p*grid.Ks_density_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr+1)];   
+                    pert_Keta_density   += r_r_ani*r_t_ani*r_p_ani*grid.Keta_density_inv_loc[I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani+1)];
+                    pert_Kxi_density    += r_r_ani*r_t_ani*r_p_ani*grid.Kxi_density_inv_loc[ I2V_INV_ANI_KNL(idp_ani+1,jdt_ani+1,kdr_ani+1)];
+
+                    // update para
+                    grid.Ks_update_loc[             I2V(i_loc,j_loc,k_loc)] += weight * pert_Ks;
+                    grid.Keta_update_loc[           I2V(i_loc,j_loc,k_loc)] += weight * pert_Keta;
+                    grid.Kxi_update_loc[            I2V(i_loc,j_loc,k_loc)] += weight * pert_Kxi;
+                    grid.Ks_density_update_loc[     I2V(i_loc,j_loc,k_loc)] += weight * pert_Ks_density;
+                    grid.Keta_density_update_loc[   I2V(i_loc,j_loc,k_loc)] += weight * pert_Keta_density;
+                    grid.Kxi_density_update_loc[    I2V(i_loc,j_loc,k_loc)] += weight * pert_Kxi_density;
+
+                } // end for i
+            } // end for j
+        } // end for k
+
+    } // end i_grid
+
+
+    //
+    // rescale kernel update to -1 ~ 1
+    //
+    // get the scaling factor
+    CUSTOMREAL Linf_Ks = _0_CR, Linf_Keta = _0_CR, Linf_Kxi = _0_CR;
+    // CUSTOMREAL Linf_Kden = _0_CR;
+    for (int k = 1; k < loc_K-1; k++) {
+        for (int j = 1; j < loc_J-1; j++) {
+            for (int i = 1; i < loc_I-1; i++) {
+                Linf_Ks   = std::max(Linf_Ks,   std::abs(grid.Ks_update_loc[I2V(i,j,k)]));
+                Linf_Keta = std::max(Linf_Keta, std::abs(grid.Keta_update_loc[I2V(i,j,k)]));
+                Linf_Kxi  = std::max(Linf_Kxi,  std::abs(grid.Kxi_update_loc[I2V(i,j,k)]));
+            }
+        }
+    }
+
+    // get the maximum scaling factor among subdomains
+    CUSTOMREAL Linf_tmp;
+    allreduce_cr_single_max(Linf_Ks, Linf_tmp);   Linf_Ks = Linf_tmp;
+    allreduce_cr_single_max(Linf_Keta, Linf_tmp); Linf_Keta = Linf_tmp;
+    allreduce_cr_single_max(Linf_Kxi, Linf_tmp);  Linf_Kxi = Linf_tmp;
+
+    CUSTOMREAL Linf_all = _0_CR;
+    Linf_all = std::max(Linf_Ks, std::max(Linf_Keta, Linf_Kxi));
+
+    Linf_Ks = Linf_all;
+    Linf_Keta = Linf_all;
+    Linf_Kxi = Linf_all;
+
+    // rescale the kernel update
+    for (int k = 0; k < loc_K; k++) {
+        for (int j = 0; j < loc_J; j++) {
+            for (int i = 0; i < loc_I; i++) {
+                grid.Ks_update_loc[I2V(i,j,k)]      /= Linf_Ks;
+                grid.Keta_update_loc[I2V(i,j,k)]    /= Linf_Keta;
+                grid.Kxi_update_loc[I2V(i,j,k)]     /= Linf_Kxi;
+            }
+        }
+    }
+}
+
+
+#endif
--- a/include/smooth_descent_dir.h
+++ b/include/smooth_descent_dir.h
@@ -0,0 +1,319 @@
+#ifndef SMOOTH_DESCENT_DIR_H
+#define SMOOTH_DESCENT_DIR_H
+
+#include <iostream>
+#include "grid.h"
+#include "config.h"
+#include "smooth.h"
+
+
+inline void smooth_descent_dir_CG(Grid& grid, const CUSTOMREAL lr, const CUSTOMREAL lt, const CUSTOMREAL lp) {
+    // smoothing kernels with Conjugate Gradient method
+    // lr,lt,lp: smoothing length in r, theta, phi direction
+
+    // smooth Ks
+    CG_smooth(grid, grid.Ks_descent_dir_loc, grid.Ks_descent_dir_loc, lr, lt, lp);
+    // smooth Keta
+    CG_smooth(grid, grid.Keta_descent_dir_loc, grid.Keta_descent_dir_loc, lr, lt, lp);
+    // smooth Kxi
+    CG_smooth(grid, grid.Kxi_descent_dir_loc, grid.Kxi_descent_dir_loc, lr, lt, lp);
+
+}
+
+
+// original method for smoothing kernels
+inline void smooth_descent_dir(Grid& grid){
+    // necessary params
+    CUSTOMREAL r_r, r_t, r_p;
+    int kdr, jdt, idp;
+
+    int k_start = 0;
+    int j_start = 0;
+    int i_start = 0;
+    int k_end   = ngrid_k;
+    int j_end   = ngrid_j;
+    int i_end   = ngrid_i;
+
+    //
+    // sum the kernel values on the inversion grid
+    //
+
+    for (int i_grid = 0; i_grid < n_inv_grids; i_grid++) {
+
+        // init coarser kernel arrays
+        for (int k = 0; k < n_inv_K_loc; k++) {
+            for (int j = 0; j < n_inv_J_loc; j++) {
+                for (int i = 0; i < n_inv_I_loc; i++) {
+                    grid.Ks_inv_loc[  I2V_INV_KNL(i,j,k)] = _0_CR;
+                    grid.Keta_inv_loc[I2V_INV_KNL(i,j,k)] = _0_CR;
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(i,j,k)] = _0_CR;
+                }
+            }
+        }
+
+        for (int k = k_start; k < k_end; k++) {
+            // CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            // r_r = -_1_CR;
+            // for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+            //     if (r_glob > grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)] && r_glob <= grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]) {
+            //         kdr = ii_invr;
+            //         r_r = (r_glob - grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)]) / (grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)] - grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)]);
+            //         break;
+            //     }
+            // }
+            CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            r_r = -_1_CR;
+            for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+                // increasing or decreasing order
+                if (in_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)])) {
+                    kdr = ii_invr;
+                    r_r = calc_ratio_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]);
+                    break;
+                }
+            }
+            // continue if r is out of the inversion grid
+            if (r_r < 0) continue;
+
+            for (int j = j_start; j < j_end; j++) {
+                // CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat();
+                // r_t = -_1_CR;
+                // for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                //     if (t_glob > grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)] && t_glob <= grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,i_grid)]) {
+                //         jdt = ii_invt;
+                //         r_t = (t_glob - grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)]) / (grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,i_grid)] - grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)]);
+                //         break;
+                //     }
+                // }
+                CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat(); // global coordinate of t (latitude)
+                r_t = -_1_CR;
+                for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                    CUSTOMREAL left  = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr+1,i_grid)]*(r_r);
+                    CUSTOMREAL right = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr+1,i_grid)]*(r_r);
+                    if (in_between(t_glob, left, right)) {
+                        jdt = ii_invt;
+                        r_t = calc_ratio_between(t_glob, left, right);
+                        break;
+                    }
+                }
+
+                // continue if t is out of the inversion grid
+                if (r_t < 0) continue;
+
+                for (int i = i_start; i < i_end; i++) {
+                    // CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();
+                    // r_p = -_1_CR;
+                    // for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                    //     if (p_glob > grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)] && p_glob <= grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,i_grid)]) {
+                    //         idp = ii_invp;
+                    //         r_p = (p_glob - grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)]) / (grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,i_grid)] - grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)]);
+                    //         break;
+                    //     }
+                    // }
+                    CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();    // global coordinate of p (longitude)
+                    r_p = -_1_CR;
+                    for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                        CUSTOMREAL left  = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(r_r);
+                        CUSTOMREAL right = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(r_r);
+                        if (in_between(p_glob, left, right)) {
+                            idp = ii_invp;
+                            r_p = calc_ratio_between(p_glob, left, right);
+                            break;
+                        }
+                    }
+
+                    // continue if p is out of the inversion grid
+                    if (r_p < 0) continue;
+
+                    // global grid id to local id
+                    int k_loc = k - grid.get_offset_k();
+                    int j_loc = j - grid.get_offset_j();
+                    int i_loc = i - grid.get_offset_i();
+
+                    // check if *_loc are inside the local subdomain
+                    if (k_loc < 0 || k_loc > loc_K-1) continue;
+                    if (j_loc < 0 || j_loc > loc_J-1) continue;
+                    if (i_loc < 0 || i_loc > loc_I-1) continue;
+
+                    // update Ks_inv Keta_inv Kxi_inv
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_descent_dir_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Keta_descent_dir_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Kxi_descent_dir_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr+1)] += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_descent_dir_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt,kdr+1)] += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Keta_descent_dir_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt,kdr+1)] += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Kxi_descent_dir_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr)] += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Ks_descent_dir_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt+1,kdr)] += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Keta_descent_dir_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt+1,kdr)] += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Kxi_descent_dir_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Ks_descent_dir_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Keta_descent_dir_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Kxi_descent_dir_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr+1)] += r_r*r_t*(_1_CR-r_p)*grid.Ks_descent_dir_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt+1,kdr+1)] += r_r*r_t*(_1_CR-r_p)*grid.Keta_descent_dir_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt+1,kdr+1)] += r_r*r_t*(_1_CR-r_p)*grid.Kxi_descent_dir_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr+1)] += r_r*(_1_CR-r_t)*r_p*grid.Ks_descent_dir_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt,kdr+1)] += r_r*(_1_CR-r_t)*r_p*grid.Keta_descent_dir_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt,kdr+1)] += r_r*(_1_CR-r_t)*r_p*grid.Kxi_descent_dir_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr)] += (_1_CR-r_r)*r_t*r_p*grid.Ks_descent_dir_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt+1,kdr)] += (_1_CR-r_r)*r_t*r_p*grid.Keta_descent_dir_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt+1,kdr)] += (_1_CR-r_r)*r_t*r_p*grid.Kxi_descent_dir_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr+1)] += r_r*r_t*r_p*grid.Ks_descent_dir_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt+1,kdr+1)] += r_r*r_t*r_p*grid.Keta_descent_dir_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt+1,kdr+1)] += r_r*r_t*r_p*grid.Kxi_descent_dir_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                } // end for i
+            } // end for j
+        } // end for k
+
+        // sum over all sub-domains
+        allreduce_cr_inplace(grid.Ks_inv_loc,   n_inv_I_loc*n_inv_J_loc*n_inv_K_loc);
+        allreduce_cr_inplace(grid.Keta_inv_loc, n_inv_I_loc*n_inv_J_loc*n_inv_K_loc);
+        allreduce_cr_inplace(grid.Kxi_inv_loc,  n_inv_I_loc*n_inv_J_loc*n_inv_K_loc);
+
+        //
+        // update factors
+        //
+        for (int k = k_start; k < k_end; k++) {
+            // CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            // r_r = -_1_CR;
+            // for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+            //     if (r_glob > grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)] && r_glob <= grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]) {
+            //         kdr = ii_invr;
+            //         r_r = (r_glob - grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)]) / (grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)] - grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)]);
+            //         break;
+            //     }
+            // }
+            CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            r_r = -_1_CR;
+            for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+                if (in_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)])) {
+                    kdr = ii_invr;
+                    r_r = calc_ratio_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]);
+                    break;
+                }
+            }
+
+            // continue if r is out of the inversion grid
+            if (r_r < 0) continue;
+
+            for (int j = j_start; j < j_end; j++) {
+                // CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat();
+                // r_t = -_1_CR;
+                // for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                //     if (t_glob > grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)] && t_glob <= grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,i_grid)]) {
+                //         jdt = ii_invt;
+                //         r_t = (t_glob - grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)]) / (grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,i_grid)] - grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)]);
+                //         break;
+                //     }
+                // }
+                CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat();
+                r_t = -_1_CR;
+                for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                    CUSTOMREAL left  = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr+1,i_grid)]*(r_r);
+                    CUSTOMREAL right = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr+1,i_grid)]*(r_r);
+                    if (in_between(t_glob, left, right)) {
+                        jdt = ii_invt;
+                        r_t = calc_ratio_between(t_glob, left, right);
+                        break;
+                    }
+                }
+
+
+                // continue if t is out of the inversion grid
+                if (r_t < 0) continue;
+
+                for (int i = i_start; i < i_end; i++) {
+                    // CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();
+                    // r_p = -_1_CR;
+                    // for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                    //     if (p_glob > grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)] && p_glob <= grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,i_grid)]) {
+                    //         idp = ii_invp;
+                    //         r_p = (p_glob - grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)]) / (grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,i_grid)] - grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)]);
+                    //         break;
+                    //     }
+                    // }
+                    CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();
+                    r_p = -_1_CR;
+                    for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                        CUSTOMREAL left  = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(r_r);
+                        CUSTOMREAL right = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(r_r);
+                        if (in_between(p_glob, left, right)) {
+                            idp = ii_invp;
+                            r_p = calc_ratio_between(p_glob, left, right);
+                            break;
+                        }
+                    }
+
+                    // continue if p is out of the inversion grid
+                    if (r_p < 0) continue;
+
+                    // global grid id to local id
+                    int k_loc = k - grid.get_offset_k();
+                    int j_loc = j - grid.get_offset_j();
+                    int i_loc = i - grid.get_offset_i();
+
+                    // check if *_loc are inside the local subdomain
+                    if (k_loc < 0 || k_loc > loc_K-1) continue;
+                    if (j_loc < 0 || j_loc > loc_J-1) continue;
+                    if (i_loc < 0 || i_loc > loc_I-1) continue;
+
+                    CUSTOMREAL pert_Ks = 0.0;
+                    CUSTOMREAL pert_Keta = 0.0;
+                    CUSTOMREAL pert_Kxi = 0.0;
+
+                    pert_Ks   += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr)];
+                    pert_Keta += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt,kdr)];
+                    pert_Kxi  += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt,kdr)];
+
+                    pert_Ks    += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr+1)];
+                    pert_Keta  += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt,kdr+1)];
+                    pert_Kxi   += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt,kdr+1)];
+
+                    pert_Ks    += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr)];
+                    pert_Keta  += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt+1,kdr)];
+                    pert_Kxi   += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt+1,kdr)];
+
+                    pert_Ks    += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr)];
+                    pert_Keta  += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt,kdr)];
+                    pert_Kxi   += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt,kdr)];
+
+                    pert_Ks    += r_r*r_t*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr+1)];
+                    pert_Keta  += r_r*r_t*(_1_CR-r_p)*grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt+1,kdr+1)];
+                    pert_Kxi   += r_r*r_t*(_1_CR-r_p)*grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt+1,kdr+1)];
+
+                    pert_Ks    += r_r*(_1_CR-r_t)*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr+1)];
+                    pert_Keta  += r_r*(_1_CR-r_t)*r_p*grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt,kdr+1)];
+                    pert_Kxi   += r_r*(_1_CR-r_t)*r_p*grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt,kdr+1)];
+
+                    pert_Ks    += (_1_CR-r_r)*r_t*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr)];
+                    pert_Keta  += (_1_CR-r_r)*r_t*r_p*grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt+1,kdr)];
+                    pert_Kxi   += (_1_CR-r_r)*r_t*r_p*grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt+1,kdr)];
+
+                    pert_Ks    += r_r*r_t*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr+1)];
+                    pert_Keta  += r_r*r_t*r_p*grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt+1,kdr+1)];
+                    pert_Kxi   += r_r*r_t*r_p*grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt+1,kdr+1)];
+
+
+                    // update para
+                    grid.Ks_descent_dir_loc[  I2V(i_loc,j_loc,k_loc)] += pert_Ks;
+                    grid.Keta_descent_dir_loc[I2V(i_loc,j_loc,k_loc)] += pert_Keta;
+                    grid.Kxi_descent_dir_loc[ I2V(i_loc,j_loc,k_loc)] += pert_Kxi;
+
+
+                } // end for i
+            } // end for j
+        } // end for k
+
+    } // end i_grid
+
+}
+
+
+
+#endif
--- a/include/smooth_grad_regul.h
+++ b/include/smooth_grad_regul.h
@@ -0,0 +1,321 @@
+#ifndef SMOOTH_GRAD_REGUL_H
+#define SMOOTH_GRAD_REGUL_H
+
+#include <iostream>
+#include "grid.h"
+#include "config.h"
+#include "smooth.h"
+
+inline void smooth_gradient_regularization_CG(Grid& grid, const CUSTOMREAL lr, const CUSTOMREAL lt, const CUSTOMREAL lp) {
+    // smoothing kernels with Conjugate Gradient method
+    // lr,lt,lp: smoothing length in r, theta, phi direction
+
+    // smooth Ks
+    CG_smooth(grid, grid.fun_gradient_regularization_penalty_loc, \
+                    grid.fun_gradient_regularization_penalty_loc, lr, lt, lp);
+    // smooth Keta
+    CG_smooth(grid, grid.eta_gradient_regularization_penalty_loc, \
+                    grid.eta_gradient_regularization_penalty_loc, lr, lt, lp);
+    // smooth Kxi
+    CG_smooth(grid, grid.xi_gradient_regularization_penalty_loc, \
+                    grid.xi_gradient_regularization_penalty_loc, lr, lt, lp);
+
+}
+
+// original method for smoothing kernels
+inline void smooth_gradient_regularization_orig(Grid& grid){
+    // necessary params
+    CUSTOMREAL r_r, r_t, r_p;
+    int kdr, jdt, idp;
+
+    int k_start = 0;
+    int j_start = 0;
+    int i_start = 0;
+    int k_end   = ngrid_k;
+    int j_end   = ngrid_j;
+    int i_end   = ngrid_i;
+
+    //
+    // sum the kernel values on the inversion grid
+    //
+
+    for (int i_grid = 0; i_grid < n_inv_grids; i_grid++) {
+
+        // init coarser kernel arrays
+        for (int k = 0; k < n_inv_K_loc; k++) {
+            for (int j = 0; j < n_inv_J_loc; j++) {
+                for (int i = 0; i < n_inv_I_loc; i++) {
+                    grid.Ks_inv_loc[  I2V_INV_KNL(i,j,k)] = _0_CR;
+                    grid.Keta_inv_loc[I2V_INV_KNL(i,j,k)] = _0_CR;
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(i,j,k)] = _0_CR;
+                }
+            }
+        }
+
+        for (int k = k_start; k < k_end; k++) {
+            // CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            // r_r = -_1_CR;
+            // for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+            //     if (r_glob > grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)] && r_glob <= grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]) {
+            //         kdr = ii_invr;
+            //         r_r = (r_glob - grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)]) / (grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)] - grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)]);
+            //         break;
+            //     }
+            // }
+            CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            r_r = -_1_CR;
+            for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+                // increasing or decreasing order
+                if (in_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)])) {
+                    kdr = ii_invr;
+                    r_r = calc_ratio_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]);
+                    break;
+                }
+            }
+
+            // continue if r is out of the inversion grid
+            if (r_r < 0) continue;
+
+            for (int j = j_start; j < j_end; j++) {
+                // CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat();
+                // r_t = -_1_CR;
+                // for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                //     if (t_glob > grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)] && t_glob <= grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,i_grid)]) {
+                //         jdt = ii_invt;
+                //         r_t = (t_glob - grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)]) / (grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,i_grid)] - grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)]);
+                //         break;
+                //     }
+                // }
+                CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat(); // global coordinate of t (latitude)
+                r_t = -_1_CR;
+                for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                    CUSTOMREAL left  = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr+1,i_grid)]*(r_r);
+                    CUSTOMREAL right = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr+1,i_grid)]*(r_r);
+                    if (in_between(t_glob, left, right)) {
+                        jdt = ii_invt;
+                        r_t = calc_ratio_between(t_glob, left, right);
+                        break;
+                    }
+                }
+
+                // continue if t is out of the inversion grid
+                if (r_t < 0) continue;
+
+                for (int i = i_start; i < i_end; i++) {
+                    // CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();
+                    // r_p = -_1_CR;
+                    // for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                    //     if (p_glob > grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)] && p_glob <= grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,i_grid)]) {
+                    //         idp = ii_invp;
+                    //         r_p = (p_glob - grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)]) / (grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,i_grid)] - grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)]);
+                    //         break;
+                    //     }
+                    // }
+                    CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();    // global coordinate of p (longitude)
+                    r_p = -_1_CR;
+                    for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                        CUSTOMREAL left  = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(r_r);
+                        CUSTOMREAL right = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(r_r);
+                        if (in_between(p_glob, left, right)) {
+                            idp = ii_invp;
+                            r_p = calc_ratio_between(p_glob, left, right);
+                            break;
+                        }
+                    }
+
+                    // continue if p is out of the inversion grid
+                    if (r_p < 0) continue;
+
+                    // global grid id to local id
+                    int k_loc = k - grid.get_offset_k();
+                    int j_loc = j - grid.get_offset_j();
+                    int i_loc = i - grid.get_offset_i();
+
+                    // check if *_loc are inside the local subdomain
+                    if (k_loc < 0 || k_loc > loc_K-1) continue;
+                    if (j_loc < 0 || j_loc > loc_J-1) continue;
+                    if (i_loc < 0 || i_loc > loc_I-1) continue;
+
+                    // update Ks_inv Keta_inv Kxi_inv
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.fun_gradient_regularization_penalty_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.eta_gradient_regularization_penalty_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.xi_gradient_regularization_penalty_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr+1)] += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.fun_gradient_regularization_penalty_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt,kdr+1)] += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.eta_gradient_regularization_penalty_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt,kdr+1)] += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.xi_gradient_regularization_penalty_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr)] += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.fun_gradient_regularization_penalty_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt+1,kdr)] += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.eta_gradient_regularization_penalty_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt+1,kdr)] += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.xi_gradient_regularization_penalty_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.fun_gradient_regularization_penalty_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.eta_gradient_regularization_penalty_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt,kdr)] += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.xi_gradient_regularization_penalty_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr+1)] += r_r*r_t*(_1_CR-r_p)*grid.fun_gradient_regularization_penalty_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt+1,kdr+1)] += r_r*r_t*(_1_CR-r_p)*grid.eta_gradient_regularization_penalty_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt+1,kdr+1)] += r_r*r_t*(_1_CR-r_p)*grid.xi_gradient_regularization_penalty_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr+1)] += r_r*(_1_CR-r_t)*r_p*grid.fun_gradient_regularization_penalty_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt,kdr+1)] += r_r*(_1_CR-r_t)*r_p*grid.eta_gradient_regularization_penalty_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt,kdr+1)] += r_r*(_1_CR-r_t)*r_p*grid.xi_gradient_regularization_penalty_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr)] += (_1_CR-r_r)*r_t*r_p*grid.fun_gradient_regularization_penalty_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt+1,kdr)] += (_1_CR-r_r)*r_t*r_p*grid.eta_gradient_regularization_penalty_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt+1,kdr)] += (_1_CR-r_r)*r_t*r_p*grid.xi_gradient_regularization_penalty_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                    grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr+1)] += r_r*r_t*r_p*grid.fun_gradient_regularization_penalty_loc[  I2V(i_loc,j_loc,k_loc)];
+                    grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt+1,kdr+1)] += r_r*r_t*r_p*grid.eta_gradient_regularization_penalty_loc[I2V(i_loc,j_loc,k_loc)];
+                    grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt+1,kdr+1)] += r_r*r_t*r_p*grid.xi_gradient_regularization_penalty_loc[ I2V(i_loc,j_loc,k_loc)];
+
+                } // end for i
+            } // end for j
+        } // end for k
+
+        // sum over all sub-domains
+        allreduce_cr_inplace(grid.Ks_inv_loc,   n_inv_I_loc*n_inv_J_loc*n_inv_K_loc);
+        allreduce_cr_inplace(grid.Keta_inv_loc, n_inv_I_loc*n_inv_J_loc*n_inv_K_loc);
+        allreduce_cr_inplace(grid.Kxi_inv_loc,  n_inv_I_loc*n_inv_J_loc*n_inv_K_loc);
+
+        //
+        // update factors
+        //
+        for (int k = k_start; k < k_end; k++) {
+            // CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            // r_r = -_1_CR;
+            // for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+            //     if (r_glob > grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)] && r_glob <= grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]) {
+            //         kdr = ii_invr;
+            //         r_r = (r_glob - grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)]) / (grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)] - grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)]);
+            //         break;
+            //     }
+            // }
+            CUSTOMREAL r_glob = grid.get_r_min() + k*grid.get_delta_r(); // global coordinate of r
+            r_r = -_1_CR;
+            for (int ii_invr = 0; ii_invr < n_inv_K_loc-1; ii_invr++){
+                // increasing or decreasing order
+                if (in_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)])) {
+                    kdr = ii_invr;
+                    r_r = calc_ratio_between(r_glob, grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr,i_grid)], grid.inv_grid->r.arr[I2V_INV_GRIDS_1DK(ii_invr+1,i_grid)]);
+                    break;
+                }
+            }
+
+            // continue if r is out of the inversion grid
+            if (r_r < 0) continue;
+
+            for (int j = j_start; j < j_end; j++) {
+                // CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat();
+                // r_t = -_1_CR;
+                // for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                //     if (t_glob > grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)] && t_glob <= grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,i_grid)]) {
+                //         jdt = ii_invt;
+                //         r_t = (t_glob - grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)]) / (grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,i_grid)] - grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,i_grid)]);
+                //         break;
+                //     }
+                // }
+                CUSTOMREAL t_glob = grid.get_lat_min() + j*grid.get_delta_lat(); // global coordinate of t (latitude)
+                r_t = -_1_CR;
+                for (int ii_invt = 0; ii_invt < n_inv_J_loc-1; ii_invt++){
+                    CUSTOMREAL left  = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt,  kdr+1,i_grid)]*(r_r);
+                    CUSTOMREAL right = grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->t.arr[I2V_INV_GRIDS_2DJ(ii_invt+1,kdr+1,i_grid)]*(r_r);
+                    if (in_between(t_glob, left, right)) {
+                        jdt = ii_invt;
+                        r_t = calc_ratio_between(t_glob, left, right);
+                        break;
+                    }
+                }
+
+                // continue if t is out of the inversion grid
+                if (r_t < 0) continue;
+
+                for (int i = i_start; i < i_end; i++) {
+                    // CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();
+                    // r_p = -_1_CR;
+                    // for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                    //     if (p_glob > grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)] && p_glob <= grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,i_grid)]) {
+                    //         idp = ii_invp;
+                    //         r_p = (p_glob - grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)]) / (grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,i_grid)] - grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,i_grid)]);
+                    //         break;
+                    //     }
+                    // }
+                    CUSTOMREAL p_glob = grid.get_lon_min() + i*grid.get_delta_lon();    // global coordinate of p (longitude)
+                    r_p = -_1_CR;
+                    for (int ii_invp = 0; ii_invp < n_inv_I_loc-1; ii_invp++){
+                        CUSTOMREAL left  = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp,  kdr,i_grid)]*(r_r);
+                        CUSTOMREAL right = grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(1-r_r) + grid.inv_grid->p.arr[I2V_INV_GRIDS_2DI(ii_invp+1,kdr,i_grid)]*(r_r);
+                        if (in_between(p_glob, left, right)) {
+                            idp = ii_invp;
+                            r_p = calc_ratio_between(p_glob, left, right);
+                            break;
+                        }
+                    }
+
+                    // continue if p is out of the inversion grid
+                    if (r_p < 0) continue;
+
+                    // global grid id to local id
+                    int k_loc = k - grid.get_offset_k();
+                    int j_loc = j - grid.get_offset_j();
+                    int i_loc = i - grid.get_offset_i();
+
+                    // check if *_loc are inside the local subdomain
+                    if (k_loc < 0 || k_loc > loc_K-1) continue;
+                    if (j_loc < 0 || j_loc > loc_J-1) continue;
+                    if (i_loc < 0 || i_loc > loc_I-1) continue;
+
+                    CUSTOMREAL pert_Ks = 0.0;
+                    CUSTOMREAL pert_Keta = 0.0;
+                    CUSTOMREAL pert_Kxi = 0.0;
+
+                    pert_Ks   += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr)];
+                    pert_Keta += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt,kdr)];
+                    pert_Kxi  += (_1_CR-r_r)*(_1_CR-r_t)*(_1_CR-r_p)*grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt,kdr)];
+
+                    pert_Ks    += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt,kdr+1)];
+                    pert_Keta  += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt,kdr+1)];
+                    pert_Kxi   += r_r*(_1_CR-r_t)*(_1_CR-r_p)*grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt,kdr+1)];
+
+                    pert_Ks    += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr)];
+                    pert_Keta  += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt+1,kdr)];
+                    pert_Kxi   += (_1_CR-r_r)*r_t*(_1_CR-r_p)*grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt+1,kdr)];
+
+                    pert_Ks    += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr)];
+                    pert_Keta  += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt,kdr)];
+                    pert_Kxi   += (_1_CR-r_r)*(_1_CR-r_t)*r_p*grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt,kdr)];
+
+                    pert_Ks    += r_r*r_t*(_1_CR-r_p)*grid.Ks_inv_loc[  I2V_INV_KNL(idp,jdt+1,kdr+1)];
+                    pert_Keta  += r_r*r_t*(_1_CR-r_p)*grid.Keta_inv_loc[I2V_INV_KNL(idp,jdt+1,kdr+1)];
+                    pert_Kxi   += r_r*r_t*(_1_CR-r_p)*grid.Kxi_inv_loc[ I2V_INV_KNL(idp,jdt+1,kdr+1)];
+
+                    pert_Ks    += r_r*(_1_CR-r_t)*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt,kdr+1)];
+                    pert_Keta  += r_r*(_1_CR-r_t)*r_p*grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt,kdr+1)];
+                    pert_Kxi   += r_r*(_1_CR-r_t)*r_p*grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt,kdr+1)];
+
+                    pert_Ks    += (_1_CR-r_r)*r_t*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr)];
+                    pert_Keta  += (_1_CR-r_r)*r_t*r_p*grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt+1,kdr)];
+                    pert_Kxi   += (_1_CR-r_r)*r_t*r_p*grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt+1,kdr)];
+
+                    pert_Ks    += r_r*r_t*r_p*grid.Ks_inv_loc[  I2V_INV_KNL(idp+1,jdt+1,kdr+1)];
+                    pert_Keta  += r_r*r_t*r_p*grid.Keta_inv_loc[I2V_INV_KNL(idp+1,jdt+1,kdr+1)];
+                    pert_Kxi   += r_r*r_t*r_p*grid.Kxi_inv_loc[ I2V_INV_KNL(idp+1,jdt+1,kdr+1)];
+
+
+                    // update para
+                    grid.fun_gradient_regularization_penalty_loc[  I2V(i_loc,j_loc,k_loc)] += pert_Ks;
+                    grid.eta_gradient_regularization_penalty_loc[I2V(i_loc,j_loc,k_loc)] += pert_Keta;
+                    grid.xi_gradient_regularization_penalty_loc[ I2V(i_loc,j_loc,k_loc)] += pert_Kxi;
+
+
+                } // end for i
+            } // end for j
+        } // end for k
+
+    } // end i_grid
+
+}
+
+
+
+#endif
--- a/include/source.fwd.h
+++ b/include/source.fwd.h
@@ -0,0 +1,2 @@
+#pragma once
+class Source;
--- a/include/source.h
+++ b/include/source.h
@@ -0,0 +1,71 @@
+#ifndef SOURCE_H
+#define SOURCE_H
+
+// to handle circular dependency
+#pragma once
+#include "grid.fwd.h"
+#include "source.fwd.h"
+
+#include "input_params.h"
+#include "grid.h"
+#include "config.h"
+#include "mpi_funcs.h"
+
+class Source {
+public:
+    Source(){};
+    ~Source();
+
+    // set source information
+    void set_source_position(InputParams &, Grid &, bool&, const std::string&, bool for_2d_solver=false);
+    //
+    // getters
+    //
+    CUSTOMREAL get_ds_lat(){return dis_src_lat;};
+    CUSTOMREAL get_ds_lon(){return dis_src_lon;};
+    CUSTOMREAL get_ds_r  (){return dis_src_r;};
+    CUSTOMREAL get_src_r(){return src_r;};   // radius
+    CUSTOMREAL get_src_t(){return src_lat;}; // radian
+    CUSTOMREAL get_src_p(){return src_lon;}; // radian
+    CUSTOMREAL get_src_dep(){return radius2depth(src_r);}; // km
+
+    //
+    // parallel getters
+    //
+    CUSTOMREAL get_fac_at_source(CUSTOMREAL*, bool check=false);
+    CUSTOMREAL get_fac_at_point(CUSTOMREAL*, int, int, int);
+
+private:
+    // positions
+    CUSTOMREAL src_lat;
+    CUSTOMREAL src_lon;
+    CUSTOMREAL src_r;
+
+    // discretize source position ids (LOCAL)
+    int i_src_loc;
+    int j_src_loc;
+    int k_src_loc;
+
+    // discretized source position
+    CUSTOMREAL dis_src_lat;
+    CUSTOMREAL dis_src_lon;
+    CUSTOMREAL dis_src_r;
+
+    CUSTOMREAL dis_src_err_r;   // r1
+    CUSTOMREAL dis_src_err_lat; // r2
+    CUSTOMREAL dis_src_err_lon; // r3
+
+    // if source is in this subdomain
+    bool is_in_subdomain = false;
+    int  src_rank;
+    bool *src_flags;
+    int n_dom_src = 0;
+
+    // position error by discretization for each direction
+    CUSTOMREAL error_lon, error_lat, error_r;
+
+    // grid gaps
+    CUSTOMREAL delta_lon, delta_lat, delta_r;
+};
+
+#endif // SOURCE_H
--- a/include/src_rec.h
+++ b/include/src_rec.h
@@ -0,0 +1,211 @@
+#ifndef SRC_REC_H
+#define SRC_REC_H
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <map>
+#include <algorithm>
+
+#include "config.h"
+#include "utils.h"
+#include "timer.h"
+#include "mpi_funcs.h"
+
+
+// information of source or receiver
+class SrcRecInfo {
+public:
+    int id = -1;
+    std::string name = "unknown";
+    CUSTOMREAL dep; // stored as depth (km), so need to depth2radious function when using it in other part of the code
+    CUSTOMREAL lat; // stored in degree, but convarted to radian when passed through get_src_* function
+    CUSTOMREAL lon; // stored in degree, but convarted to radian when passed through get_src_* function
+
+    int year             = -1;
+    int month            = -1;
+    int day              = -1;
+    int hour             = -1;
+    int min              = -1;
+    CUSTOMREAL sec       = -1.0;
+    CUSTOMREAL mag       = -1.0;
+
+    CUSTOMREAL adjoint_source = 0.0;
+    CUSTOMREAL adjoint_source_density = 0.0;
+
+    int n_data = 0;
+
+    // arrays for storing arrival times on boundary surfaces, calculated by 2D Eikonal solver
+    bool        is_out_of_region    = false;   // is the source or receiver in the region; false: in the refion; true: teleseismic
+    //CUSTOMREAL* arr_times_bound_N   = nullptr; // arrival time of the receiver at the north boundary of the subdomain
+    //CUSTOMREAL* arr_times_bound_E   = nullptr; // arrival time of the receiver at the east boundary of the subdomain
+    //CUSTOMREAL* arr_times_bound_W   = nullptr; // arrival time of the receiver at the west boundary of the subdomain
+    //CUSTOMREAL* arr_times_bound_S   = nullptr; // arrival time of the receiver at the south boundary of the subdomain
+    //CUSTOMREAL* arr_times_bound_Bot = nullptr; // arrival time of the receiver at the bottom boundary of the subdomain
+    //bool*       is_bound_src;                  // true if the source is on the boundary surface
+
+    // kernel
+    CUSTOMREAL sta_correct = 0.0;
+    CUSTOMREAL sta_correct_kernel = 0.0;
+
+    // relocation
+    bool       is_stop      = false;
+    CUSTOMREAL tau_opt      = 0.0;
+    CUSTOMREAL grad_chi_i   = 0.0;
+    CUSTOMREAL grad_chi_j   = 0.0;
+    CUSTOMREAL grad_chi_k   = 0.0;
+    CUSTOMREAL grad_tau     = 0.0;
+    CUSTOMREAL sum_weight   = 0.0;
+    CUSTOMREAL vobj_src_reloc_old       = 99999999.9;
+    CUSTOMREAL vobj_src_reloc           = 0.0;
+    CUSTOMREAL vobj_grad_norm_src_reloc = 0.0;
+
+    CUSTOMREAL step_length_max  = step_length_src_reloc;  // 2 km default, step length for relocation
+    CUSTOMREAL change_dep = 0.0;
+    CUSTOMREAL change_lat = 0.0;
+    CUSTOMREAL change_lon = 0.0;
+    CUSTOMREAL change_tau = 0.0;
+    int        Ndata      = 0.0;
+
+    bool       is_T_written_into_file = false; 
+};
+
+class DataInfo {
+public:
+
+    CUSTOMREAL data_weight  = 1.0;   // the weight in the src_rec file
+    CUSTOMREAL weight       = 1.0;   // the actual weight in the inversion, equal   data_weight * weight about the data type;
+    CUSTOMREAL weight_reloc = 1.0;   // the actual weight for relocation,   equal   data_weight * weight about the data type;
+
+    std::string phase = "unknown";
+
+    bool dual_data   = false;   // if true, this data is a dual data, used for generating kernel, but not for obj estimation (if true, data type = 2 or 3)
+
+    bool is_src_rec = false; // absolute traveltime, single source - receiver
+
+    // source information
+    int         id_src   = -1;
+    std::string name_src = "unknown";
+
+    // receiver information
+    int         id_rec   = -1;
+    std::string name_rec = "unknown";
+
+    // traveltime
+    CUSTOMREAL travel_time     = -999.0;
+    CUSTOMREAL travel_time_obs = -999.0;
+
+    // receiver pair infomation
+    bool is_rec_pair                       = false;   // common source differential traveltime
+    std::vector<int>         id_rec_pair   = {-1,-1};
+    std::vector<std::string> name_rec_pair = {"unknown","unknown"};
+
+    // common source differential travel time
+    CUSTOMREAL cs_dif_travel_time     = -999.0;
+    CUSTOMREAL cs_dif_travel_time_obs = -999.0;
+
+    // source pair infomation
+    bool                     is_src_pair   = false;   // common receiver differential traveltime (for future)
+    std::vector<int>         id_src_pair   = {-1,-1};
+    std::vector<std::string> name_src_pair = {"unknown","unknown"};
+
+    // common receiver differential travel time
+    CUSTOMREAL cr_dif_travel_time     = -999.0;
+    CUSTOMREAL cr_dif_travel_time_obs = -999.0;
+
+    // source relocation
+    CUSTOMREAL DTi          = 0.0;
+    CUSTOMREAL DTj          = 0.0;
+    CUSTOMREAL DTk          = 0.0;
+
+    std::vector<CUSTOMREAL> DTi_pair = {0.0, 0.0};
+    std::vector<CUSTOMREAL> DTj_pair = {0.0, 0.0};
+    std::vector<CUSTOMREAL> DTk_pair = {0.0, 0.0};
+
+};
+
+
+
+// methods for managing SrcRec objects/lists
+
+// parse src_rec_file
+void parse_src_rec_file(std::string&                      , \
+                        std::map<std::string, SrcRecInfo>&, \
+                        std::map<std::string, SrcRecInfo>&, \
+                        std::map<std::string, std::map<std::string,std::vector<DataInfo>>>&, \
+                        std::vector<std::string>&, \
+                        std::vector<std::vector<std::vector<std::string>>>&);
+
+// parse sta_correctoion_file
+void parse_sta_correction_file(std::string&, \
+                               std::map<std::string, SrcRecInfo>&);
+
+// swap the sources and receivers
+void do_swap_src_rec(std::map<std::string, SrcRecInfo> &, \
+                     std::map<std::string, SrcRecInfo> &, \
+                     std::map<std::string, std::map<std::string, std::vector<DataInfo>>> &, \
+                     std::vector<std::string>&);
+
+// do not swap the sources and receivers
+void do_not_swap_src_rec(std::map<std::string, SrcRecInfo> &, \
+                     std::map<std::string, SrcRecInfo> &, \
+                     std::map<std::string, std::map<std::string, std::vector<DataInfo>>> &, \
+                     std::vector<std::string>&);
+
+// tele seismic source management
+void separate_region_and_tele_src_rec_data(std::map<std::string, SrcRecInfo> &,
+                                           std::map<std::string, SrcRecInfo> &,
+                                           std::map<std::string, std::map<std::string, std::vector<DataInfo>>>&,
+                                           std::map<std::string, SrcRecInfo> &,
+                                           std::map<std::string, SrcRecInfo> &,
+                                           std::map<std::string, std::map<std::string, std::vector<DataInfo>>>&,
+                                           std::map<std::string, SrcRecInfo> &,
+                                           std::map<std::string, SrcRecInfo> &,
+                                           std::map<std::string, std::map<std::string, std::vector<DataInfo>>>&,
+                                           std::map<std::string, int>        &,
+                                           int                               &,
+                                           int                               &,
+                                           int                               &,
+                                           int                               &,
+                                           int                               &,
+                                           const CUSTOMREAL, const CUSTOMREAL,
+                                           const CUSTOMREAL, const CUSTOMREAL,
+                                           const CUSTOMREAL, const CUSTOMREAL,
+                                           bool);
+
+void merge_region_and_tele_src(std::map<std::string, SrcRecInfo> &,
+                               std::map<std::string, SrcRecInfo> &,
+                               std::map<std::string, std::map<std::string,std::vector<DataInfo>>>&,
+                               std::vector<std::string>&,
+                               std::map<std::string, SrcRecInfo> &,
+                               std::map<std::string, SrcRecInfo> &,
+                               std::map<std::string, std::map<std::string,std::vector<DataInfo>>>&);
+
+// distribute srcrec data to all simulation groups
+void distribute_src_rec_data(std::map<std::string, SrcRecInfo>&, \
+                             std::map<std::string, SrcRecInfo>&, \
+                             std::map<std::string, std::map<std::string,std::vector<DataInfo>>>&, \
+                             std::vector<std::string>&, \
+                             std::map<std::string, SrcRecInfo>&, \
+                             std::map<std::string, SrcRecInfo>&, \
+                             std::map<std::string, std::map<std::string,std::vector<DataInfo>>>&, \
+                             std::vector<std::string>&, \
+                             std::vector<std::string>&);
+
+void prepare_src_map_for_2d_solver(std::map<std::string, SrcRecInfo>&, \
+                                   std::map<std::string, SrcRecInfo>&, \
+                                   std::vector<std::string>&, \
+                                   std::map<std::string, SrcRecInfo>&);
+
+void send_src_info_inter_sim(SrcRecInfo&, int);
+void recv_src_info_inter_sim(SrcRecInfo&, int);
+void broadcast_src_info_intra_sim(SrcRecInfo&, int);
+void send_rec_info_inter_sim(SrcRecInfo&, int);
+void recv_rec_info_inter_sim(SrcRecInfo&, int);
+void broadcast_rec_info_intra_sim(SrcRecInfo&, int);
+void send_data_info_inter_sim(DataInfo&, int);
+void recv_data_info_inter_sim(DataInfo&, int);
+
+#endif //SRC_REC_H
--- a/include/timer.h
+++ b/include/timer.h
@@ -0,0 +1,98 @@
+#ifndef TIMER_H
+#define TIMER_H
+
+
+#include <iostream>
+#include <chrono>
+#include <iomanip>
+#include <ctime>
+#include <string>
+#include "config.h"
+
+class Timer {
+public:
+    Timer(std::string name, bool show_start_time = false) : name(name), show_start_time(show_start_time) {
+        start_timer();
+    }
+
+    ~Timer() {}
+
+    std::string name;
+    std::chrono::high_resolution_clock::time_point start;
+    std::chrono::high_resolution_clock::time_point end;
+    std::chrono::duration<double> elapsed_time;
+    bool show_start_time = false;
+
+    void start_timer() {
+        start = std::chrono::high_resolution_clock::now();
+        time_back = start;
+        if (show_start_time && proc_read_srcrec) {
+            std::cout << this->name << " started at " << get_current_utc_time() << std::endl;
+        }
+    }
+
+    void stop_timer() {
+        end = std::chrono::high_resolution_clock::now();
+        elapsed_time = end - start;
+        print_elapsed_time();
+    }
+
+    double get_start() {
+        return std::chrono::duration<double>(start.time_since_epoch()).count();
+    }
+
+    double get_t_delta() {
+        auto current_time = std::chrono::high_resolution_clock::now();
+        auto delta = current_time - time_back;
+        time_back = current_time;
+        return std::chrono::duration<double>(delta).count();
+    }
+
+    double get_t() {
+        return std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - start).count();
+    }
+
+    std::string get_start_t() {
+        return get_current_utc_time(start);
+    }
+
+    std::string get_end_t() {
+        return get_current_utc_time(end);
+    }
+
+    std::string get_elapsed_t() {
+        return std::to_string(elapsed_time.count()) + " sec";
+    }
+
+    std::string get_utc_from_time_t(std::time_t time_t) {
+        std::tm utc_tm = *std::gmtime(&time_t);
+
+        std::ostringstream oss;
+        oss << std::put_time(&utc_tm, "%F %T") << " UTC";
+        return oss.str();
+    }
+
+private:
+    std::chrono::high_resolution_clock::time_point time_back;
+
+    void print_elapsed_time() {
+        if (if_verbose)
+            std::cout << "Time for " << name << ": " << elapsed_time.count() << " sec" << std::endl;
+    }
+
+    std::string get_current_utc_time(std::chrono::high_resolution_clock::time_point time = std::chrono::high_resolution_clock::now()) {
+        std::chrono::milliseconds ms = std::chrono::duration_cast<std::chrono::milliseconds>(time.time_since_epoch());
+        std::time_t time_t = ms.count() / 1000;
+
+        std::tm utc_tm = *std::gmtime(&time_t);
+
+        std::ostringstream oss;
+        oss << std::put_time(&utc_tm, "%F %T") << " UTC";
+        return oss.str();
+    }
+
+};
+
+
+
+#endif // TIMER_H
--- a/include/utils.h
+++ b/include/utils.h
@@ -0,0 +1,401 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <iostream>
+#include <math.h>
+#include <string.h>
+#include <fstream>
+#include <sys/stat.h>
+#include <iomanip>
+#include <sstream>
+#include <cmath>
+#include <new> // for std::bad_alloc
+#include <cstdlib> // for std::exit
+
+
+// chec if compiler is gcc 7.5 or older
+//#if __GNUC__ == 7 && __GNUC_MINOR__ <= 5
+//#include <experimental/filesystem>
+//#define GNUC_7_5
+//#elif __cplusplus > 201402L // compilers supporting c++17
+//#include <filesystem>
+//#endif
+
+#include "config.h"
+
+
+inline int mkpath(std::string s,mode_t mode) {
+    size_t pos=0;
+    std::string dir;
+    int mdret = 0;
+
+    if(s[s.size()-1]!='/'){
+        // force trailing / so we can handle everything in loop
+        s+='/';
+    }
+
+    while((pos=s.find_first_of('/',pos))!=std::string::npos){
+        dir=s.substr(0,pos++);
+        if(dir.size()==0) continue; // if leading / first time is 0 length
+        if((mdret=mkdir(dir.c_str(),mode)) && errno!=EEXIST){
+            return mdret;
+        }
+    }
+    return mdret;
+}
+
+
+
+inline void create_output_dir(std::string dir_path){
+
+    // create output directory if not exists (directories tree)
+    if (world_rank == 0) {
+
+//#if __cplusplus > 201402L && !defined(GNUC_7_5)
+//        // this function requires c++17
+//        if (!std::filesystem::exists(dir_path)){
+//            std::filesystem::create_directories(dir_path);
+//        } else {
+//            std::cout << "Output directory already exists. Overwriting..." << std::endl;
+//        }
+//#else // compilers not supporting std++17
+        // check if directory exists
+        struct stat info;
+        if (stat(dir_path.c_str(), &info) != 0){
+            // directory does not exist
+            // create directory
+            int status = mkpath(dir_path, 0755);
+            if (status != 0){
+                std::cout << "Error: cannot create output directory" << std::endl;
+                std::cout << "Exiting..." << std::endl;
+                exit(1);
+            }
+        } else {
+            // directory exists
+            std::cout << "Output directory already exists. Overwriting..." << std::endl;
+        }
+//#endif
+
+    }
+
+}
+
+
+inline bool is_file_exist(const char* fileName)
+{
+    return static_cast<bool>(std::ifstream(fileName));
+}
+
+
+inline void stdout_by_main(char const* str){
+    if (sim_rank == 0 && inter_sub_rank == 0 && sub_rank == 0 && id_sim == 0)
+        std::cout << str << std::endl;
+}
+
+
+inline void stdout_by_rank_zero(char const* str){
+    if(world_rank == 0)
+        std::cout << str << std::endl;
+}
+
+
+inline void parse_options(int argc, char* argv[]){
+    bool input_file_found = false;
+
+    for (int i = 1; i < argc; i++){
+        if(strcmp(argv[i], "-v") == 0)
+            if_verbose = true;
+        else if (strcmp(argv[i],"-i") == 0){
+            input_file = argv[i+1];
+            input_file_found = true;
+        }
+        else if (strcmp(argv[i], "-h") == 0){
+            stdout_by_main("usage: mpirun -np 4 TOMOATT -i input_params.yaml");
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    // error if input_file is not found
+    if(!input_file_found){
+        stdout_by_main("usage: mpirun -np 4 TOMOATT -i input_params.yaml");
+        std::cout << "Error: input parameter file not found" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+
+inline void parse_options_srcrec_weight(int argc, char* argv[]){
+    bool input_file_found = false;
+
+    for (int i = 1; i < argc; i++){
+        if(strcmp(argv[i], "-v") == 0)
+            if_verbose = true;
+        else if (strcmp(argv[i],"-i") == 0){
+            input_file = argv[i+1];
+            input_file_found = true;
+        } else if (strcmp(argv[i],"-r") == 0){
+            // reference value
+            ref_value = atof(argv[i+1]);
+        } else if (strcmp(argv[i], "-o") == 0){
+            // get output filename
+            output_file_weight = argv[i+1];
+        }
+    }
+
+    // error if input_file is not found
+    if(!input_file_found){
+        stdout_by_main("usage: ./SrcRecWeight -i srcrec_file.txt -r 10.0 -o srcrec_weight.txt");
+        std::cout << argc << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+
+template <typename T>
+inline int check_data_type(T const& data){
+    if (std::is_same<T,bool>::value){
+        return 0;
+    } else if (std::is_same<T, int>::value){
+        return 1;
+    } else if (std::is_same<T, float>::value){
+        return 2;
+    } else if (std::is_same<T, double>::value){
+        return 3;
+    } else {
+        std::cout << "Error: custom real type is not float or double" << std::endl;
+        std::cout << "Please check the definition of CUSTOMREAL in io.h" << std::endl;
+        std::cout << "Exiting..." << std::endl;
+        exit(1);
+    }
+}
+
+
+template <typename T>
+inline T my_square(T const& a){
+    return a * a;
+}
+
+
+// defined function is more than 2 times slower than inline function
+//#define my_square(a) (a * a)
+
+
+template <typename T>
+T* allocateMemory(size_t size, int allocationID) {
+    try {
+        T* ptr = new T[size];
+        return ptr;
+    } catch (const std::bad_alloc& e) {
+        // Handle memory allocation failure
+        std::cerr << "Memory allocation failed for ID " << allocationID << ": " << e.what() << std::endl;
+        std::exit(EXIT_FAILURE); // Exit the program on allocation failure
+    }
+}
+
+
+inline void RLonLat2xyz(CUSTOMREAL lon, CUSTOMREAL lat, CUSTOMREAL R, CUSTOMREAL& x, CUSTOMREAL& y, CUSTOMREAL& z){
+    /*
+    lon : longitude in radian
+    lat : latitude in radian
+    R : radius
+
+    x : x coordinate in m
+    y : y coordinate in m
+    z : z coordinate in m
+    */
+    //x = R*cos(lat*DEG2RAD)*cos(lon*DEG2RAD);
+    //y = R*cos(lat*DEG2RAD)*sin(lon*DEG2RAD);
+    //z = R*sin(lat*DEG2RAD);
+    x = R*cos(lat)*cos(lon);
+    y = R*cos(lat)*sin(lon);
+    z = R*sin(lat);
+
+}
+
+
+// calculate epicentral distance in radian from lon lat in radian
+inline void Epicentral_distance_sphere(const CUSTOMREAL lat1, const CUSTOMREAL lon1, \
+                                       const CUSTOMREAL lat2, const CUSTOMREAL lon2, \
+                                       CUSTOMREAL& dist) {
+
+    // calculate epicentral distance in radian
+    //dist = std::abs(acos(sin(lat1)*sin(lat2)+cos(lat1)*cos(lat2)*cos(lon2-lon1)));
+    CUSTOMREAL lon_dif = lon2 - lon1;
+
+    dist = std::atan2(std::sqrt((my_square(std::cos(lat2) * std::sin(lon_dif)) + \
+                                 my_square(std::cos(lat1) * std::sin(lat2) - std::sin(lat1) * std::cos(lat2) * std::cos(lon_dif)))), \
+                                 std::sin(lat1) * std::sin(lat2) + std::cos(lat1) * std::cos(lat2) * std::cos(lon_dif));
+
+}
+
+
+
+
+//calculate azimuth
+inline void Azimuth_sphere(CUSTOMREAL lat1, CUSTOMREAL lon1, \
+                             CUSTOMREAL lat2, CUSTOMREAL lon2, \
+                             CUSTOMREAL& azi) {
+
+    if (isZero(my_square((lat1-lat2)) \
+        &&   + my_square((lon1-lon2)))){
+        azi = _0_CR;
+    } else {
+        azi = atan2(sin(lon2-lon1)*cos(lat2), \
+                    cos(lat1)*sin(lat2)-sin(lat1)*cos(lat2)*cos(lon2-lon1));
+        if (azi < _0_CR)
+            azi += _2_CR * PI;
+    }
+}
+
+
+inline void WGS84ToCartesian(CUSTOMREAL& lon, CUSTOMREAL& lat, CUSTOMREAL R, \
+                             CUSTOMREAL& x, CUSTOMREAL& y, CUSTOMREAL& z, \
+                             CUSTOMREAL& lon_center, CUSTOMREAL& lat_center){
+
+    // equatorial radius WGS84 major axis
+    const static CUSTOMREAL equRadius = R_earth; // in m in this function
+    const static CUSTOMREAL flattening = 1.0 / 298.257222101;
+
+    const static CUSTOMREAL sqrEccentricity = flattening * (2.0 - flattening);
+
+    const CUSTOMREAL lat_rad = lat - lat_center; // input should be already in radian
+    const CUSTOMREAL lon_rad = lon - lon_center;
+    const CUSTOMREAL alt = R - equRadius; // altitude (height above sea level)
+
+    const CUSTOMREAL sinLat = sin(lat_rad);
+    const CUSTOMREAL cosLat = cos(lat_rad);
+    const CUSTOMREAL sinLon = sin(lon_rad);
+    const CUSTOMREAL cosLon = cos(lon_rad);
+
+    // Normalized radius
+    const CUSTOMREAL normRadius = equRadius / sqrt(1.0 - sqrEccentricity * sinLat * sinLat);
+
+    x = (normRadius + alt) * cosLat * cosLon;
+    y = (normRadius + alt) * cosLat * sinLon;
+    z = (normRadius * (1.0 - sqrEccentricity) + alt) * sinLat;
+
+}
+
+
+template <typename T>
+inline T dot_product(T const* const a, T const* const b, int const& n){
+    CUSTOMREAL result = _0_CR;
+    for (int i = 0; i < n; i++){
+        result += a[i] * b[i];
+    }
+    return result;
+}
+
+template <typename T>
+inline T find_max(T const* const a, int const& n){
+    T max = a[0];
+    for (int i = 1; i < n; i++){
+        if (a[i] > max){
+            max = a[i];
+        }
+    }
+    return max;
+}
+
+template <typename T>
+inline T find_absmax(T const* const a, int const& n){
+    T max = fabs(a[0]);
+    for (int i = 1; i < n; i++){
+        if (fabs(a[i]) > max){
+            max = fabs(a[i]);
+        }
+    }
+    return max;
+}
+
+template <typename T>
+inline T calc_l2norm(T const* const a, int const& n){
+    T result = _0_CR;
+    for (int i = 0; i < n; i++){
+        result += a[i] * a[i];
+    }
+    return result;
+}
+
+
+inline std::string int2string_zero_fill(int i) {
+    std::stringstream ss;
+    ss << std::setfill('0') << std::setw(4) << i;
+    return ss.str();
+}
+
+
+inline bool in_between(CUSTOMREAL const& a, CUSTOMREAL const& b, CUSTOMREAL const& c){
+    // check if a is between b and c
+    if ((a-b)*(a-c) <= _0_CR){
+        return true;
+    } else {
+        return false;
+    }
+}
+
+
+inline CUSTOMREAL calc_ratio_between(CUSTOMREAL const& a, CUSTOMREAL const& b, CUSTOMREAL const& c){
+    // calculate ratio of a between b and c
+    //if (b < c)
+        return (a - b) / (c - b);
+    //else
+    //    return (a - b) / (b - c);
+}
+
+
+inline void linear_interpolation_1d_sorted(const CUSTOMREAL* x1, const CUSTOMREAL* y1, const int& n1, const CUSTOMREAL* x2, CUSTOMREAL* y2, const int& n2){
+    // linear interpolation for sorted 1d array (monotonely increasing)
+    // x1 : positions of the first array
+    // y1 : values of the first array
+    // n1 : size of the first array
+    // x2 : positions of the second array
+    // y2 : values of the second array
+    // n2 : size of the second array
+    int start = 0;
+    for(int pt2=0; pt2<n2; pt2++){              // x2[pt2] <= x1[0]
+        if (x2[pt2] <= x1[0]){
+            y2[pt2] = y1[0];
+        } else if (x2[pt2] >= x1[n1-1]){    // x2[pt2] >= x1[n1-1]
+            y2[pt2] = y1[n1-1];
+        } else {                            // x1[0] < x2[pt2] < x1[n1-1]
+            for(int pt1=start; pt1<n1-1; pt1++){
+                if (x2[pt2] >= x1[pt1] && x2[pt2] <= x1[pt1+1]){
+                    y2[pt2] = y1[pt1] + (y1[pt1+1] - y1[pt1]) * (x2[pt2] - x1[pt1]) / (x1[pt1+1] - x1[pt1]);
+                    start = pt1;
+                    break;
+                }
+            }
+        }
+    }
+}
+
+template<typename T>
+std::vector<CUSTOMREAL> linspace(T start_in, T end_in, int num_in)
+{
+
+  std::vector<CUSTOMREAL> linspaced;
+
+  double start = static_cast<CUSTOMREAL>(start_in);
+  double end = static_cast<CUSTOMREAL>(end_in);
+  double num = static_cast<CUSTOMREAL>(num_in);
+
+  if (num == 0) { return linspaced; }
+  if (num == 1) 
+    {
+      linspaced.push_back(start);
+      return linspaced;
+    }
+
+  double delta = (end - start) / (num - 1);
+
+  for(int i=0; i < num-1; ++i)
+    {
+      linspaced.push_back(start + delta * i);
+    }
+  linspaced.push_back(end); // I want to ensure that start and end
+                            // are exactly the same as the input
+  return linspaced;
+}
+
+#endif // UTILS_H
--- a/include/vectorized_sweep.h
+++ b/include/vectorized_sweep.h
@@ -0,0 +1,825 @@
+#ifndef VECTORIZED_SWEEP_H
+#define VECTORIZED_SWEEP_H
+
+
+#include<vector>
+#include "config.h"
+
+#ifdef USE_SIMD // closed at the end of this file
+#include "simd_conf.h"
+
+#if USE_AVX || USE_AVX512
+__mT COEF      = _mmT_set1_pT(1.0);
+__mT v_1       = _mmT_set1_pT(1.0);
+__mT v_0       = _mmT_set1_pT(0.0);
+__mT v_half    = _mmT_set1_pT(0.5);
+__mT v_2       = _mmT_set1_pT(2.0);
+__mT v_m2      = _mmT_set1_pT(-2.0);
+__mT v_4       = _mmT_set1_pT(4.0);
+__mT v_m3      = _mmT_set1_pT(-3.0);
+__mT coe_max   = _mmT_set1_pT(1e19);
+__mT v_eps     = _mmT_set1_pT(1e-12);
+__mT COEF_TELE = _mmT_set1_pT(1.05);
+
+
+// square of __mT
+inline __mT my_square_v(__mT const& a){
+    return _mmT_mul_pT(a, a);
+}
+
+// pp = (a - b) * Dinv
+inline __mT calc_1d_stencil(__mT const& a, __mT const& b, __mT const& Dinv){
+    return _mmT_mul_pT(_mmT_sub_pT(a,b),Dinv);
+}
+
+/*
+    ww  = _1_CR/(_1_CR+_2_CR*my_square_v((eps + my_square_v(      a \
+                                                       -_2_CR*b \
+                                                       +      c) ) \
+
+                                     / (eps + my_square_v(      d \
+                                                       -_2_CR*a \
+                                                       +      b) )) );
+
+    pp  = sign  * (_1_CR - ww) * (         b \                  <-- sign = +1 or -1
+                                 -         d) * 0.5 * Dinv \
+                         + ww  * ( -3.0 * a \
+                                 +  4.0 * b \
+                                 -  1.0 * c ) * 0.5 * Dinv );
+*/
+inline __mT calc_3d_stencil(__mT const& a, __mT const& b, __mT const&c, __mT const& d, __mT const& Dinv_half, int const& sign){
+
+#ifdef __FMA__
+    // v_eps + square(a - 2.0*b + c)
+    __mT tmp1 = _mmT_add_pT(v_eps,my_square_v(_mmT_add_pT(a,_mmT_fmadd_pT(v_m2,b,c))));
+    // v_eps + square(d - 2.0*a + b)
+    __mT tmp2 = _mmT_add_pT(v_eps,my_square_v(_mmT_add_pT(d,_mmT_fmadd_pT(v_m2,a,b))));
+    // ww = 1.0/(1.0 + 2.0 * square(tmp1/tmp2))
+    __mT ww = _mmT_div_pT(v_1,_mmT_fmadd_pT(v_2,my_square_v(_mmT_div_pT(tmp1,tmp2)),v_1));
+    // pp = sign* ((1.0 - ww) * (b - d) / 2.0 / D
+    //                  + ww  * (-3.0* a + 4.0 * b - c) * Dinv_half)
+    /*
+    */
+    return _mmT_mul_pT(_mmT_set1_pT(sign), \
+                _mmT_add_pT(\
+                            _mmT_mul_pT(_mmT_sub_pT(v_1,ww),_mmT_mul_pT(_mmT_sub_pT(b,d),Dinv_half)),\
+                            _mmT_mul_pT(ww,_mmT_mul_pT(_mmT_sub_pT(_mmT_fmadd_pT(v_4,b,_mmT_mul_pT(v_m3,a)),c),Dinv_half))\
+                )\
+           );
+#else
+    __mT tmp1 = _mmT_add_pT(v_eps,my_square_v(_mmT_add_pT(a,_mmT_add_pT(_mmT_mul_pT(v_m2,b),c))));
+    __mT tmp2 = _mmT_add_pT(v_eps,my_square_v(_mmT_add_pT(d,_mmT_add_pT(_mmT_mul_pT(v_m2,a),b))));
+    __mT ww = _mmT_div_pT(v_1,_mmT_add_pT(v_1,_mmT_mul_pT(v_2,my_square_v(_mmT_div_pT(tmp1,tmp2)))));
+    return _mmT_mul_pT(_mmT_set1_pT(sign), \
+                _mmT_add_pT(\
+                            _mmT_mul_pT(_mmT_sub_pT(v_1,ww),_mmT_mul_pT(_mmT_sub_pT(b,d),Dinv_half)),\
+                            _mmT_mul_pT(ww,_mmT_mul_pT(_mmT_sub_pT(_mmT_add_pT(_mmT_mul_pT(v_4,b),_mmT_mul_pT(v_m3,a)),c),Dinv_half))\
+                )\
+           );
+#endif
+
+}
+
+#elif USE_ARM_SVE
+
+inline __mT my_square_v(svbool_t const& pg, __mT const& a){
+    return svmul_f64_z(pg, a, a);
+}
+
+inline __mT calc_1d_stencil(svbool_t const& pg, __mT const& a, __mT const& b, __mT const& Dinv){
+    return svmul_f64_z(pg, svsub_f64_z(pg, a, b), Dinv);
+}
+
+inline __mT calc_3d_stencil(svbool_t const& pg, __mT const& a, __mT const& b, __mT const&c, __mT const& d, __mT const& Dinv_half, int const& sign){
+
+    __mT v_1     = svdup_f64(1.0);
+    __mT v_2     = svdup_f64(2.0);
+    __mT v_m2    = svdup_f64(-2.0);
+    __mT v_4     = svdup_f64(4.0);
+    __mT v_m3    = svdup_f64(-3.0);
+    __mT v_eps   = svdup_f64(1e-12);
+
+    // v_eps + square(a - 2.0*b + c)
+    //__mT tmp1 = svadd_f64_z(pg, v_eps, my_square_v(pg, svadd_f64_z(pg, a, svadd_f64_z(pg, svmul_f64_z(pg, v_m2, b), c))));
+    __mT tmp1 = svadd_f64_z(pg, v_eps, my_square_v(pg, svadd_f64_z(pg, a, svmad_f64_z(pg, v_m2, b, c))));
+    // v_eps + square(d - 2.0*a + b)
+    //__mT tmp2 = svadd_f64_z(pg, v_eps, my_square_v(pg, svadd_f64_z(pg, d, svadd_f64_z(pg, svmul_f64_z(pg, v_m2, a), b))));
+    __mT tmp2 = svadd_f64_z(pg, v_eps, my_square_v(pg, svadd_f64_z(pg, d, svmad_f64_z(pg, v_m2, a, b))));
+    // ww = 1.0/(1.0 + 2.0 * square(tmp1/tmp2))
+    //__mT ww = svdiv_f64_z(pg, v_1, svadd_f64_z(pg, v_1, svmul_f64_z(pg, v_2, my_square_v(pg, svdiv_f64_z(pg, tmp1, tmp2)))));
+    __mT ww = svdiv_f64_z(pg, v_1, svmad_f64_z(pg, v_2, my_square_v(pg, svdiv_f64_z(pg, tmp1, tmp2)), v_1));
+    // pp = sign* ((1.0 - ww) * (b - d) / 2.0 / D
+    //                  + ww  * (-3.0* a + 4.0 * b - c) * Dinv_half)
+    /*
+    return svmul_f64_m(pg, svdup_f64(sign), \
+            svadd_f64_z(pg, \
+                        svmul_f64_z(pg, svsub_f64_z(pg, v_1, ww), svmul_f64_z(pg, svsub_f64_z(pg, b, d), Dinv_half)),\
+                        svmul_f64_z(pg, ww, svmul_f64_z(pg, svsub_f64_z(pg, svadd_f64_z(pg, svmul_f64_z(pg, v_4, b), svmul_f64_z(pg, v_m3, a)), c), Dinv_half))\
+            )\
+       );
+    */
+    return svmul_f64_m(pg, svdup_f64(sign), \
+                svadd_f64_z(pg, \
+                            svmul_f64_z(pg, svsub_f64_z(pg, v_1, ww), svmul_f64_z(pg, svsub_f64_z(pg, b, d), Dinv_half)),\
+                            svmul_f64_z(pg, ww, svmul_f64_z(pg, svsub_f64_z(pg, svmad_f64_z(pg, v_4, b, svmul_f64_z(pg, v_m3, a)), c), Dinv_half))\
+                )\
+           );
+}
+
+
+#endif
+
+
+
+inline void vect_stencil_1st_pre_simd(
+#if USE_ARM_SVE
+                                      svbool_t const& pg,
+#endif
+                                      __mT const& v_iip, __mT const& v_jjt, __mT const& v_kkr,
+                                      __mT const& v_c__,
+                                      __mT const& v_p__, __mT const& v_m__, __mT const& v__p_, __mT const& v__m_, __mT const& v___p, __mT const& v___m,
+                                      __mT& v_pp1, __mT& v_pp2, __mT& v_pt1, __mT& v_pt2, __mT& v_pr1, __mT& v_pr2,
+                                      __mT const& v_DP_inv, __mT const& v_DT_inv, __mT const& v_DR_inv,
+                                      __mT const& v_DP_inv_half, __mT const& v_DT_inv_half, __mT const& v_DR_inv_half,
+                                      int const& NP, int const& NT, int const& NR){
+
+#if USE_AVX512 || USE_AVX
+
+    v_pp1 = calc_1d_stencil(v_c__, v_m__, v_DP_inv);
+    v_pp2 = calc_1d_stencil(v_p__, v_c__, v_DP_inv);
+    v_pt1 = calc_1d_stencil(v_c__, v__m_, v_DT_inv);
+    v_pt2 = calc_1d_stencil(v__p_, v_c__, v_DT_inv);
+    v_pr1 = calc_1d_stencil(v_c__, v___m, v_DR_inv);
+    v_pr2 = calc_1d_stencil(v___p, v_c__, v_DR_inv);
+
+#elif USE_ARM_SVE
+
+    v_pp1 = calc_1d_stencil(pg, v_c__, v_m__, v_DP_inv);
+    v_pp2 = calc_1d_stencil(pg, v_p__, v_c__, v_DP_inv);
+    v_pt1 = calc_1d_stencil(pg, v_c__, v__m_, v_DT_inv);
+    v_pt2 = calc_1d_stencil(pg, v__p_, v_c__, v_DT_inv);
+    v_pr1 = calc_1d_stencil(pg, v_c__, v___m, v_DR_inv);
+    v_pr2 = calc_1d_stencil(pg, v___p, v_c__, v_DR_inv);
+
+#endif
+
+}
+
+
+inline void vect_stencil_3rd_pre_simd(
+#if USE_ARM_SVE
+                                      svbool_t const& pg,
+#endif
+                                      __mT const& v_iip, __mT const& v_jjt, __mT const& v_kkr,
+                                      __mT const& v_c__,
+                                      __mT const& v_p__,     __mT const& v_m__,     __mT const& v__p_,    __mT const& v__m_,    __mT const& v___p,    __mT const& v___m,
+                                      __mT const& v_pp____,  __mT const& v_mm____,  __mT const& v___pp__, __mT const& v___mm__, __mT const& v_____pp, __mT const& v_____mm,
+                                      __mT& v_pp1,     __mT& v_pp2,     __mT& v_pt1,    __mT& v_pt2,    __mT& v_pr1,    __mT& v_pr2,
+                                      __mT const& v_DP_inv,  __mT const& v_DT_inv,  __mT const& v_DR_inv,
+                                      __mT const& v_DP_inv_half, __mT const& v_DT_inv_half, __mT const& v_DR_inv_half,
+                                      int const& NP, int const& NT, int const& NR){
+
+    const int PLUS  = 1;
+    const int MINUS = -1;
+
+#if USE_AVX512
+
+    __mmaskT mask_i_eq_1         = _mm512_cmp_pT_mask(v_iip, v_1,_CMP_EQ_OQ);    // if iip == 1
+    __mmaskT mask_j_eq_1         = _mm512_cmp_pT_mask(v_jjt, v_1,_CMP_EQ_OQ);    // if jjt == 1
+    __mmaskT mask_k_eq_1         = _mm512_cmp_pT_mask(v_kkr, v_1,_CMP_EQ_OQ);    // if kkr == 1
+    __mmaskT mask_i_eq_N_minus_2 = _mm512_cmp_pT_mask(v_iip, _mmT_set1_pT(NP-2),_CMP_EQ_OQ); // if iip == N-2
+    __mmaskT mask_j_eq_N_minus_2 = _mm512_cmp_pT_mask(v_jjt, _mmT_set1_pT(NT-2),_CMP_EQ_OQ); // if jjt == N-2
+    __mmaskT mask_k_eq_N_minus_2 = _mm512_cmp_pT_mask(v_kkr, _mmT_set1_pT(NR-2),_CMP_EQ_OQ); // if kkr == N-2
+
+    // 1 < iip < N-2
+    __mmaskT mask_i_else = _kand_maskT(
+                               _mm512_cmp_pT_mask(v_iip, v_1                , _CMP_GT_OQ),
+                               _mm512_cmp_pT_mask(_mmT_set1_pT(NP-2),  v_iip, _CMP_GT_OQ)
+                         );
+    // 1 < jjt < N-2
+    __mmaskT mask_j_else = _kand_maskT(
+                               _mm512_cmp_pT_mask(v_jjt, v_1                , _CMP_GT_OQ),
+                               _mm512_cmp_pT_mask(_mmT_set1_pT(NT-2),  v_jjt, _CMP_GT_OQ)
+                         );
+    // 1 < kkr < N-2
+    __mmaskT mask_k_else = _kand_maskT(
+                               _mm512_cmp_pT_mask(v_kkr, v_1                , _CMP_GT_OQ),
+                               _mm512_cmp_pT_mask(_mmT_set1_pT(NR-2),  v_kkr, _CMP_GT_OQ)
+                         );
+
+    // if _i_eq_1 == true
+    v_pp1 = _mm512_mask_blend_pT(mask_i_eq_1, calc_1d_stencil(v_c__, v_m__, v_DP_inv)                            , v_pp1 );
+    v_pp2 = _mm512_mask_blend_pT(mask_i_eq_1, calc_3d_stencil(v_c__, v_p__, v_pp____, v_m__, v_DP_inv_half, PLUS), v_pp2 );
+    // if_i_eq_N_minus_2 == true
+    v_pp1 = _mm512_mask_blend_pT(mask_i_eq_N_minus_2, calc_3d_stencil(v_c__, v_m__, v_mm____, v_p__, v_DP_inv_half, MINUS), v_pp1);
+    v_pp2 = _mm512_mask_blend_pT(mask_i_eq_N_minus_2, calc_1d_stencil(v_p__, v_c__, v_DP_inv)                             , v_pp2);
+    // else
+    v_pp1 = _mm512_mask_blend_pT(mask_i_else, calc_3d_stencil(v_c__, v_m__, v_mm____, v_p__, v_DP_inv_half, MINUS), v_pp1);
+    v_pp2 = _mm512_mask_blend_pT(mask_i_else, calc_3d_stencil(v_c__, v_p__, v_pp____, v_m__, v_DP_inv_half, PLUS) , v_pp2);
+
+    // if _j_eq_1 == true
+    v_pt1 = _mm512_mask_blend_pT(mask_j_eq_1, calc_1d_stencil(v_c__, v__m_, v_DT_inv)                            , v_pt1);
+    v_pt2 = _mm512_mask_blend_pT(mask_j_eq_1, calc_3d_stencil(v_c__, v__p_, v___pp__, v__m_, v_DT_inv_half, PLUS), v_pt2);
+    // if _j_eq_N_minus_2 == true
+    v_pt1 = _mm512_mask_blend_pT(mask_j_eq_N_minus_2, calc_3d_stencil(v_c__, v__m_, v___mm__, v__p_, v_DT_inv_half, MINUS), v_pt1);
+    v_pt2 = _mm512_mask_blend_pT(mask_j_eq_N_minus_2, calc_1d_stencil(v__p_, v_c__, v_DT_inv)                             , v_pt2);
+    // else
+    v_pt1 = _mm512_mask_blend_pT(mask_j_else, calc_3d_stencil(v_c__, v__m_, v___mm__, v__p_, v_DT_inv_half, MINUS), v_pt1);
+    v_pt2 = _mm512_mask_blend_pT(mask_j_else, calc_3d_stencil(v_c__, v__p_, v___pp__, v__m_, v_DT_inv_half, PLUS) , v_pt2);
+
+    // if _k_eq_1 == true
+    v_pr1 = _mm512_mask_blend_pT(mask_k_eq_1, calc_1d_stencil(v_c__, v___m, v_DR_inv)                            , v_pr1 );
+    v_pr2 = _mm512_mask_blend_pT(mask_k_eq_1, calc_3d_stencil(v_c__, v___p, v_____pp, v___m, v_DR_inv_half, PLUS), v_pr2 );
+    // if _k_eq_N_minus_2 == true
+    v_pr1 = _mm512_mask_blend_pT(mask_k_eq_N_minus_2, calc_3d_stencil(v_c__, v___m, v_____mm, v___p, v_DR_inv_half, MINUS), v_pr1);
+    v_pr2 = _mm512_mask_blend_pT(mask_k_eq_N_minus_2, calc_1d_stencil(v___p, v_c__, v_DR_inv)                             , v_pr2);
+    // else
+    v_pr1 = _mm512_mask_blend_pT(mask_k_else, calc_3d_stencil(v_c__, v___m, v_____mm, v___p, v_DR_inv_half, MINUS), v_pr1);
+    v_pr2 = _mm512_mask_blend_pT(mask_k_else, calc_3d_stencil(v_c__, v___p, v_____pp, v___m, v_DR_inv_half, PLUS) , v_pr2);
+
+#elif USE_AVX
+
+    __mT mask_i_eq_1         = _mm256_cmp_pT(v_iip, v_1,_CMP_EQ_OQ);    // if iip == 1
+    __mT mask_j_eq_1         = _mm256_cmp_pT(v_jjt, v_1,_CMP_EQ_OQ);    // if jjt == 1
+    __mT mask_k_eq_1         = _mm256_cmp_pT(v_kkr, v_1,_CMP_EQ_OQ);    // if kkr == 1
+    __mT mask_i_eq_N_minus_2 = _mm256_cmp_pT(v_iip, _mmT_set1_pT(NP-2),_CMP_EQ_OQ); // if iip == N-2
+    __mT mask_j_eq_N_minus_2 = _mm256_cmp_pT(v_jjt, _mmT_set1_pT(NT-2),_CMP_EQ_OQ); // if jjt == N-2
+    __mT mask_k_eq_N_minus_2 = _mm256_cmp_pT(v_kkr, _mmT_set1_pT(NR-2),_CMP_EQ_OQ); // if kkr == N-2
+
+    // 1 < iip < N-2
+    __m256d mask_i_else = _mm256_and_pd(
+                                        _mm256_cmp_pT(v_iip, v_1              , _CMP_GT_OQ),
+                                        _mm256_cmp_pT(_mmT_set1_pT(NP-2),v_iip, _CMP_GT_OQ)
+                         );
+    // 1 < jjt < N-2
+    __m256d mask_j_else = _mm256_and_pd(
+                                        _mm256_cmp_pT(v_jjt, v_1              , _CMP_GT_OQ),
+                                        _mm256_cmp_pT(_mmT_set1_pT(NT-2),v_jjt, _CMP_GT_OQ)
+                         );
+    // 1 < kkr < N-2
+    __m256d mask_k_else = _mm256_and_pd(
+                                        _mm256_cmp_pT(v_kkr, v_1              , _CMP_GT_OQ),
+                                        _mm256_cmp_pT(_mmT_set1_pT(NR-2),v_kkr, _CMP_GT_OQ)
+                         );
+
+    // if _i_eq_1 == true
+    v_pp1 = _mm256_blendv_pd(v_pp1, calc_1d_stencil(v_c__, v_m__, v_DP_inv),                             mask_i_eq_1);
+    v_pp2 = _mm256_blendv_pd(v_pp2, calc_3d_stencil(v_c__, v_p__, v_pp____, v_m__, v_DP_inv_half, PLUS), mask_i_eq_1);
+    // if_i_eq_N_minus_2 == true
+    v_pp1 = _mm256_blendv_pd(v_pp1, calc_3d_stencil(v_c__, v_m__, v_mm____, v_p__, v_DP_inv_half, MINUS), mask_i_eq_N_minus_2);
+    v_pp2 = _mm256_blendv_pd(v_pp2, calc_1d_stencil(v_p__, v_c__, v_DP_inv),                              mask_i_eq_N_minus_2);
+    // else
+    v_pp1 = _mm256_blendv_pd(v_pp1, calc_3d_stencil(v_c__, v_m__, v_mm____, v_p__, v_DP_inv_half, MINUS), mask_i_else);
+    v_pp2 = _mm256_blendv_pd(v_pp2, calc_3d_stencil(v_c__, v_p__, v_pp____, v_m__, v_DP_inv_half, PLUS),  mask_i_else);
+
+    // if _j_eq_1 == true
+    v_pt1 = _mm256_blendv_pd(v_pt1, calc_1d_stencil(v_c__, v__m_, v_DT_inv),                             mask_j_eq_1);
+    v_pt2 = _mm256_blendv_pd(v_pt2, calc_3d_stencil(v_c__, v__p_, v___pp__, v__m_, v_DT_inv_half, PLUS), mask_j_eq_1);
+    // if _j_eq_N_minus_2 == true
+    v_pt1 = _mm256_blendv_pd(v_pt1, calc_3d_stencil(v_c__, v__m_, v___mm__, v__p_, v_DT_inv_half, MINUS), mask_j_eq_N_minus_2);
+    v_pt2 = _mm256_blendv_pd(v_pt2, calc_1d_stencil(v__p_, v_c__, v_DT_inv),                              mask_j_eq_N_minus_2);
+    // else
+    v_pt1 = _mm256_blendv_pd(v_pt1, calc_3d_stencil(v_c__, v__m_, v___mm__, v__p_, v_DT_inv_half, MINUS), mask_j_else);
+    v_pt2 = _mm256_blendv_pd(v_pt2, calc_3d_stencil(v_c__, v__p_, v___pp__, v__m_, v_DT_inv_half, PLUS),  mask_j_else);
+
+    // if _k_eq_1 == true
+    v_pr1 = _mm256_blendv_pd(v_pr1, calc_1d_stencil(v_c__, v___m, v_DR_inv),                             mask_k_eq_1);
+    v_pr2 = _mm256_blendv_pd(v_pr2, calc_3d_stencil(v_c__, v___p, v_____pp, v___m, v_DR_inv_half, PLUS), mask_k_eq_1);
+    // if _k_eq_N_minus_2 == true
+    v_pr1 = _mm256_blendv_pd(v_pr1, calc_3d_stencil(v_c__, v___m, v_____mm, v___p, v_DR_inv_half, MINUS), mask_k_eq_N_minus_2);
+    v_pr2 = _mm256_blendv_pd(v_pr2, calc_1d_stencil(v___p, v_c__, v_DR_inv),                              mask_k_eq_N_minus_2);
+    // else
+    v_pr1 = _mm256_blendv_pd(v_pr1, calc_3d_stencil(v_c__, v___m, v_____mm, v___p, v_DR_inv_half, MINUS), mask_k_else);
+    v_pr2 = _mm256_blendv_pd(v_pr2, calc_3d_stencil(v_c__, v___p, v_____pp, v___m, v_DR_inv_half, PLUS),  mask_k_else);
+
+#elif USE_ARM_SVE
+
+    svfloat64_t v_1 = svdup_f64(1.0);
+
+    svfloat64_t v_NP_minus_2 = svdup_f64(NP-2);
+    svfloat64_t v_NT_minus_2 = svdup_f64(NT-2);
+    svfloat64_t v_NR_minus_2 = svdup_f64(NR-2);
+
+    svbool_t mask_i_eq_1         = svcmpeq_f64(pg, v_iip, v_1);    // if iip == 1
+    svbool_t mask_j_eq_1         = svcmpeq_f64(pg, v_jjt, v_1);    // if jjt == 1
+    svbool_t mask_k_eq_1         = svcmpeq_f64(pg, v_kkr, v_1);    // if kkr == 1
+    svbool_t mask_i_eq_N_minus_2 = svcmpeq_f64(pg, v_iip, v_NP_minus_2); // if iip == N-2
+    svbool_t mask_j_eq_N_minus_2 = svcmpeq_f64(pg, v_jjt, v_NT_minus_2); // if jjt == N-2
+    svbool_t mask_k_eq_N_minus_2 = svcmpeq_f64(pg, v_kkr, v_NR_minus_2); // if kkr == N-2
+
+    // 1 < iip < N-2
+    svbool_t mask_i_else = svand_b_z(pg,
+                               svcmpgt_f64(pg, v_iip, v_1),
+                               svcmplt_f64(pg, v_iip, v_NP_minus_2)
+                         );
+    // 1 < jjt < N-2
+    svbool_t mask_j_else = svand_b_z(pg,
+                               svcmpgt_f64(pg, v_jjt, v_1),
+                               svcmplt_f64(pg, v_jjt, v_NT_minus_2)
+                         );
+    // 1 < kkr < N-2
+    svbool_t mask_k_else = svand_b_z(pg,
+                               svcmpgt_f64(pg, v_kkr, v_1),
+                               svcmplt_f64(pg, v_kkr, v_NR_minus_2)
+                         );
+
+    // if _i_eq_1 == true
+    v_pp1 = svsel_f64(mask_i_eq_1, calc_1d_stencil(mask_i_eq_1, v_c__, v_m__, v_DP_inv)                            , v_pp1);
+    v_pp2 = svsel_f64(mask_i_eq_1, calc_3d_stencil(mask_i_eq_1, v_c__, v_p__, v_pp____, v_m__, v_DP_inv_half, PLUS), v_pp2);
+    // if_i_eq_N_minus_2 == true
+    v_pp1 = svsel_f64(mask_i_eq_N_minus_2, calc_3d_stencil(mask_i_eq_N_minus_2, v_c__, v_m__, v_mm____, v_p__, v_DP_inv_half, MINUS), v_pp1);
+    v_pp2 = svsel_f64(mask_i_eq_N_minus_2, calc_1d_stencil(mask_i_eq_N_minus_2, v_p__, v_c__, v_DP_inv)                             , v_pp2);
+    // else
+    v_pp1 = svsel_f64(mask_i_else, calc_3d_stencil(mask_i_else, v_c__, v_m__, v_mm____, v_p__, v_DP_inv_half, MINUS), v_pp1);
+    v_pp2 = svsel_f64(mask_i_else, calc_3d_stencil(mask_i_else, v_c__, v_p__, v_pp____, v_m__, v_DP_inv_half, PLUS) , v_pp2);
+
+    // if _j_eq_1 == true
+    v_pt1 = svsel_f64(mask_j_eq_1, calc_1d_stencil(mask_j_eq_1, v_c__, v__m_, v_DT_inv)                            , v_pt1);
+    v_pt2 = svsel_f64(mask_j_eq_1, calc_3d_stencil(mask_j_eq_1, v_c__, v__p_, v___pp__, v__m_, v_DT_inv_half, PLUS), v_pt2);
+    // if _j_eq_N_minus_2 == true
+    v_pt1 = svsel_f64(mask_j_eq_N_minus_2, calc_3d_stencil(mask_j_eq_N_minus_2, v_c__, v__m_, v___mm__, v__p_, v_DT_inv_half, MINUS), v_pt1);
+    v_pt2 = svsel_f64(mask_j_eq_N_minus_2, calc_1d_stencil(mask_j_eq_N_minus_2, v__p_, v_c__, v_DT_inv)                             , v_pt2);
+    // else
+    v_pt1 = svsel_f64(mask_j_else, calc_3d_stencil(mask_j_else, v_c__, v__m_, v___mm__, v__p_, v_DT_inv_half, MINUS), v_pt1);
+    v_pt2 = svsel_f64(mask_j_else, calc_3d_stencil(mask_j_else, v_c__, v__p_, v___pp__, v__m_, v_DT_inv_half, PLUS) , v_pt2);
+
+    // if _k_eq_1 == true
+    v_pr1 = svsel_f64(mask_k_eq_1, calc_1d_stencil(mask_k_eq_1, v_c__, v___m, v_DR_inv)                            , v_pr1);
+    v_pr2 = svsel_f64(mask_k_eq_1, calc_3d_stencil(mask_k_eq_1, v_c__, v___p, v_____pp, v___m, v_DR_inv_half, PLUS), v_pr2);
+    // if _k_eq_N_minus_2 == true
+    v_pr1 = svsel_f64(mask_k_eq_N_minus_2, calc_3d_stencil(mask_k_eq_N_minus_2, v_c__, v___m, v_____mm, v___p, v_DR_inv_half, MINUS), v_pr1);
+    v_pr2 = svsel_f64(mask_k_eq_N_minus_2, calc_1d_stencil(mask_k_eq_N_minus_2, v___p, v_c__, v_DR_inv)                             , v_pr2);
+    // else
+    v_pr1 = svsel_f64(mask_k_else, calc_3d_stencil(mask_k_else, v_c__, v___m, v_____mm, v___p, v_DR_inv_half, MINUS), v_pr1);
+    v_pr2 = svsel_f64(mask_k_else, calc_3d_stencil(mask_k_else, v_c__, v___p, v_____pp, v___m, v_DR_inv_half, PLUS) , v_pr2);
+
+
+#endif
+
+}
+
+
+inline void vect_stencil_1st_3rd_apre_simd(
+#if USE_ARM_SVE
+                                           svbool_t const& pg,
+#endif
+                                           __mT& v_tau,        __mT const& v_fac_a,__mT const& v_fac_b, __mT const& v_fac_c, __mT const& v_fac_f,
+                                           __mT const& v_T0v,  __mT const& v_T0p,  __mT const& v_T0t,   __mT const& v_T0r,   __mT const& v_fun, __mT const& v_change,
+                                           __mT const& v_pp1,  __mT const& v_pp2,  __mT const& v_pt1,   __mT const& v_pt2,   __mT const& v_pr1, __mT const& v_pr2,
+                                           __mT const& DP_inv, __mT const& DT_inv, __mT const& DR_inv){
+
+#if USE_AVX512 || USE_AVX
+
+    // sigr = COEF * sqrt(v_fac_a)*v_T0v;
+    __mT sigr = _mmT_mul_pT(_mmT_mul_pT(COEF,_mmT_sqrt_pT(v_fac_a)),v_T0v);
+    // sigt = COEF * sqrt(v_fac_b)*v_T0v;
+    __mT sigt = _mmT_mul_pT(_mmT_mul_pT(COEF,_mmT_sqrt_pT(v_fac_b)),v_T0v);
+    // sigp = COEF * sqrt(v_fac_c)*v_T0v;
+    __mT sigp = _mmT_mul_pT(_mmT_mul_pT(COEF,_mmT_sqrt_pT(v_fac_c)),v_T0v);
+
+    // coe = 1.0 / (sigr/D + sigt/D + sigz/D);
+    __mT coe = _mmT_div_pT(v_1,_mmT_add_pT(_mmT_add_pT(_mmT_mul_pT(sigr,DR_inv),_mmT_mul_pT(sigt,DT_inv)),_mmT_mul_pT(sigp,DP_inv)));
+    // coe becomes inf as sig* goes to 0
+    // if coe > 1e19, set coe = 1e19
+    coe = _mmT_min_pT(coe,coe_max);
+
+    // Htau  = v_fac_a * square(v_T0r * v_tau + v_T0v * (v_pr1 + v_pr2)*0.5);
+    __mT Htau = _mmT_mul_pT(v_fac_a,my_square_v(_mmT_add_pT(_mmT_mul_pT(v_T0r,v_tau),_mmT_mul_pT(v_T0v,_mmT_mul_pT(_mmT_add_pT(v_pr1,v_pr2),v_half)))));
+
+    // Htau += v_fac_b * square(v_T0t * v_tau + v_T0v * (v_pt1 + v_pt2)*0.5))
+    Htau = _mmT_add_pT(Htau,_mmT_mul_pT(v_fac_b,my_square_v(_mmT_add_pT(_mmT_mul_pT(v_T0t,v_tau),_mmT_mul_pT(v_T0v,_mmT_mul_pT(_mmT_add_pT(v_pt1,v_pt2), v_half))))));
+    // Htau += v_fac_c * square(v_T0p * v_tau + v_T0v * (v_pp1 + v_pp2)*0.5))
+    Htau = _mmT_add_pT(Htau,_mmT_mul_pT(v_fac_c,my_square_v(_mmT_add_pT(_mmT_mul_pT(v_T0p,v_tau),_mmT_mul_pT(v_T0v,_mmT_mul_pT(_mmT_add_pT(v_pp1,v_pp2), v_half))))));
+
+    // tmp1 = ( v_T0t * v_tau + v_T0v * (v_pt1 + v_pt2)*0.5)
+    __mT tmp1 = _mmT_add_pT(_mmT_mul_pT(v_T0t,v_tau),_mmT_mul_pT(v_T0v,_mmT_mul_pT(_mmT_add_pT(v_pt1,v_pt2), v_half)));
+    // tmp2 = ( v_T0p * v_tau + v_T0v * (v_pp1 + v_pp2)*0.5))
+    __mT tmp2 = _mmT_add_pT(_mmT_mul_pT(v_T0p,v_tau),_mmT_mul_pT(v_T0v,_mmT_mul_pT(_mmT_add_pT(v_pp1,v_pp2), v_half)));
+    // tmp3 = -2.0 * v_fac_f * tmp1 * tmp2;
+    __mT tmp3 = _mmT_mul_pT(v_m2,_mmT_mul_pT(v_fac_f,_mmT_mul_pT(tmp1,tmp2)));
+
+    // Htau = sqrt(Htau + tmp3); // # becamse nan as Htau - tmp3 goes to 0
+    Htau = _mmT_sqrt_pT(_mmT_add_pT(Htau,tmp3));
+
+    // tmp = (sigr*(v_pr2 - v_pr1) + sigt*(v_pt2 - v_pt1) + sigz*(v_pp2 - v_pp1))*0.5
+    __mT tmp = _mmT_mul_pT(v_half,_mmT_add_pT(_mmT_add_pT(_mmT_mul_pT(sigr,_mmT_sub_pT(v_pr2,v_pr1)),_mmT_mul_pT(sigt,_mmT_sub_pT(v_pt2,v_pt1))),_mmT_mul_pT(sigp,_mmT_sub_pT(v_pp2,v_pp1))));
+
+    // v_tau += coe * ((v_fun - Htau) + tmp) if mask is true
+    v_tau = _mmT_add_pT(v_tau,_mmT_mul_pT(coe,_mmT_add_pT(_mmT_sub_pT(v_fun,Htau),tmp)));
+
+#if USE_AVX512
+    // mask if v_change != 1.0
+    __mmaskT mask = _mm512_cmp_pT_mask(v_change,v_1,_CMP_NEQ_OQ);
+    // set 1 if mask is true
+    v_tau = _mm512_mask_blend_pT(mask,v_tau,v_1);
+#elif USE_AVX
+    __m256d mask = _mm256_cmp_pT(v_change, v_1,_CMP_NEQ_OQ);
+    v_tau        = _mm256_blendv_pd(v_tau, v_1,mask);
+#endif
+
+#elif USE_ARM_SVE
+
+    __mT COEF    = svdup_f64(1.0);
+    __mT v_1     = svdup_f64(1.0);
+    __mT v_half  = svdup_f64(0.5);
+    __mT v_m2    = svdup_f64(-2.0);
+    __mT coe_max = svdup_f64(1e19);
+
+    // sigr = COEF * sqrt(v_fac_a)*v_T0v;
+    __mT sigr = svmul_f64_z(pg, svmul_f64_z(pg,COEF,svsqrt_f64_z(pg,v_fac_a)), v_T0v);
+    // sigt = COEF * sqrt(v_fac_b)*v_T0v;
+    __mT sigt = svmul_f64_z(pg, svmul_f64_z(pg,COEF,svsqrt_f64_z(pg,v_fac_b)), v_T0v);
+    // sigp = COEF * sqrt(v_fac_c)*v_T0v;
+    __mT sigp = svmul_f64_z(pg, svmul_f64_z(pg,COEF,svsqrt_f64_z(pg,v_fac_c)), v_T0v);
+
+    // coe = 1.0 / (sigr/D + sigt/D + sigz/D);
+    __mT coe = svdiv_f64_z(pg, v_1, svadd_f64_z(pg, svadd_f64_z(pg, svmul_f64_z(pg, sigr, DR_inv), svmul_f64_z(pg, sigt, DT_inv)), svmul_f64_z(pg, sigp, DP_inv)));
+    // coe becomes inf as sig* goes to 0
+    // if coe > 1e19, set coe = 1e19
+    coe = svmin_f64_z(pg, coe, coe_max);
+
+    // Htau  = v_fac_a * square(v_T0r * v_tau + v_T0v * (v_pr1 + v_pr2)*0.5);
+    __mT Htau = svmul_f64_z(pg, v_fac_a, my_square_v(pg, svadd_f64_z(pg, \
+                                                         svmul_f64_z(pg, v_T0r, v_tau), \
+                                                         svmul_f64_z(pg, v_T0v, svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pr1, v_pr2))))\
+                                                   )\
+                            );
+
+    // Htau += v_fac_b * square(v_T0t * v_tau + v_T0v * (v_pt1 + v_pt2)*0.5))
+    Htau = svadd_f64_z(pg, Htau, svmul_f64_z(pg, v_fac_b, my_square_v(pg, svadd_f64_z(pg, \
+                                                                          svmul_f64_z(pg, v_T0t, v_tau), \
+                                                                          svmul_f64_z(pg, v_T0v, svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pt1, v_pt2))))\
+                                                                   )\
+                                            )\
+                      );
+    // Htau += v_fac_c * square(v_T0p * v_tau + v_T0v * (v_pp1 + v_pp2)*0.5))
+    Htau = svadd_f64_z(pg, Htau, svmul_f64_z(pg, v_fac_c, my_square_v(pg, svadd_f64_z(pg, \
+                                                                          svmul_f64_z(pg, v_T0p, v_tau), \
+                                                                          svmul_f64_z(pg, v_T0v, svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pp1, v_pp2))))\
+                                                                   )\
+                                            )\
+                      );
+
+    // tmp1 = ( v_T0t * v_tau + v_T0v * (v_pt1 + v_pt2)*0.5)
+    __mT tmp1 = svadd_f64_z(pg, \
+                             svmul_f64_z(pg, v_T0t, v_tau), \
+                             svmul_f64_z(pg, v_T0v, svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pt1, v_pt2)))\
+                             );
+    // tmp2 = ( v_T0p * v_tau + v_T0v * (v_pp1 + v_pp2)*0.5))
+    __mT tmp2 = svadd_f64_z(pg, \
+                             svmul_f64_z(pg, v_T0p, v_tau), \
+                             svmul_f64_z(pg, v_T0v, svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pp1, v_pp2)))\
+                             );
+    // tmp3 = -2.0 * v_fac_f * tmp1 * tmp2;
+    __mT tmp3 = svmul_f64_z(pg, v_m2, svmul_f64_z(pg, v_fac_f, svmul_f64_z(pg, tmp1, tmp2)));
+
+    // Htau = sqrt(Htau + tmp3); // # becamse nan as Htau - tmp3 goes to 0
+    Htau = svsqrt_f64_z(pg, svadd_f64_z(pg, Htau, tmp3));
+
+    // tmp = (sigr*(v_pr2 - v_pr1) + sigt*(v_pt2 - v_pt1) + sigz*(v_pp2 - v_pp1))*0.5
+    __mT tmp = svmul_f64_z(pg, v_half, svadd_f64_z(pg, \
+                                                   svadd_f64_z(pg, \
+                                                               svmul_f64_z(pg, sigr, svsub_f64_z(pg, v_pr2, v_pr1)), \
+                                                               svmul_f64_z(pg, sigt, svsub_f64_z(pg, v_pt2, v_pt1))\
+                                                               ), \
+                                                   svmul_f64_z(pg, sigp, svsub_f64_z(pg, v_pp2, v_pp1))\
+                                                   )\
+                            );
+
+    // v_tau += coe * ((v_fun - Htau) + tmp) if mask is true
+    v_tau = svadd_f64_z(pg, v_tau, svmul_f64_z(pg, coe, svadd_f64_z(pg, svsub_f64_z(pg, v_fun, Htau), tmp)));
+    // mask = v_change != 1.0
+    svbool_t mask = svcmpne_f64(pg, v_change, v_1);
+    // v_tau = v_1 if mask is true (!= 1.0)
+    v_tau = svsel_f64(mask, v_1, v_tau);
+
+#endif
+
+}
+
+
+
+inline void vect_stencil_1st_3rd_apre_simd_tele(
+#if USE_ARM_SVE
+                                           svbool_t const& pg,
+#endif
+                                           __mT& v_tau,        __mT const& v_fac_a,__mT const& v_fac_b, __mT const& v_fac_c, __mT const& v_fac_f,
+                                           __mT const&  v_fun, __mT const& v_change,
+                                           __mT const& v_pp1,  __mT const& v_pp2,  __mT const& v_pt1,   __mT const& v_pt2,   __mT const& v_pr1, __mT const& v_pr2,
+                                           __mT const& DP_inv, __mT const& DT_inv, __mT const& DR_inv){
+
+#if USE_AVX512 || USE_AVX
+
+    // sigr = COEF * sqrt(v_fac_a);
+    __mT sigr = _mmT_mul_pT(COEF_TELE,_mmT_sqrt_pT(v_fac_a));
+    // sigt = COEF * sqrt(v_fac_b);
+    __mT sigt = _mmT_mul_pT(COEF_TELE,_mmT_sqrt_pT(v_fac_b));
+    // sigp = COEF * sqrt(v_fac_c);
+    __mT sigp = _mmT_mul_pT(COEF_TELE,_mmT_sqrt_pT(v_fac_c));
+
+    // coe = 1.0 / (sigr/D + sigt/D + sigz/D);
+    __mT coe = _mmT_div_pT(v_1,_mmT_add_pT(_mmT_add_pT(_mmT_mul_pT(sigr,DR_inv),_mmT_mul_pT(sigt,DT_inv)),_mmT_mul_pT(sigp,DP_inv)));
+    // coe becomes inf as sig* goes to 0
+    // if coe > 1e19, set coe = 1e19
+    coe = _mmT_min_pT(coe,coe_max);
+
+    // Htau  = v_fac_a * square((v_pr1 + v_pr2)*0.5);
+    __mT Htau = _mmT_mul_pT(v_fac_a,my_square_v(_mmT_mul_pT(_mmT_add_pT(v_pr1,v_pr2),v_half)));
+
+    // Htau += v_fac_b * square((v_pt1 + v_pt2)*0.5))
+    Htau = _mmT_add_pT(Htau,_mmT_mul_pT(v_fac_b,my_square_v(_mmT_mul_pT(_mmT_add_pT(v_pt1,v_pt2), v_half))));
+    // Htau += v_fac_c * square((v_pp1 + v_pp2)*0.5))
+    Htau = _mmT_add_pT(Htau,_mmT_mul_pT(v_fac_c,my_square_v(_mmT_mul_pT(_mmT_add_pT(v_pp1,v_pp2), v_half))));
+
+    // tmp1 = (v_pt1 + v_pt2)*0.5
+    __mT tmp1 = _mmT_mul_pT(_mmT_add_pT(v_pt1,v_pt2), v_half);
+    // tmp2 = (v_pp1 + v_pp2)*0.5
+    __mT tmp2 = _mmT_mul_pT(_mmT_add_pT(v_pp1,v_pp2), v_half);
+    // tmp3 = -2.0 * v_fac_f * tmp1 * tmp2;
+    __mT tmp3 = _mmT_mul_pT(v_m2,_mmT_mul_pT(v_fac_f,_mmT_mul_pT(tmp1,tmp2)));
+
+    // Htau = sqrt(Htau + tmp3); // # becamse nan as Htau - tmp3 goes to 0
+    Htau = _mmT_sqrt_pT(_mmT_add_pT(Htau,tmp3));
+
+    // tmp = (sigr*(v_pr2 - v_pr1) + sigt*(v_pt2 - v_pt1) + sigz*(v_pp2 - v_pp1))*0.5
+    __mT tmp = _mmT_mul_pT(v_half,_mmT_add_pT(_mmT_add_pT(_mmT_mul_pT(sigr,_mmT_sub_pT(v_pr2,v_pr1)),_mmT_mul_pT(sigt,_mmT_sub_pT(v_pt2,v_pt1))),_mmT_mul_pT(sigp,_mmT_sub_pT(v_pp2,v_pp1))));
+
+    // v_tau += coe * ((v_fun - Htau) + tmp) if mask is true
+    v_tau = _mmT_add_pT(v_tau,_mmT_mul_pT(coe,_mmT_add_pT(_mmT_sub_pT(v_fun,Htau),tmp)));
+
+#if USE_AVX512
+    // mask if v_change != 1.0
+    __mmaskT mask = _mm512_cmp_pT_mask(v_change,v_1,_CMP_NEQ_OQ);
+    // set 1 if mask is true
+    v_tau = _mm512_mask_blend_pT(mask,v_tau,v_1);
+#elif USE_AVX
+    __m256d mask = _mm256_cmp_pT(v_change, v_1,_CMP_NEQ_OQ);
+    v_tau        = _mm256_blendv_pd(v_tau, v_1,mask);
+#endif
+
+#elif USE_ARM_SVE
+
+    __mT COEF_TELE = svdup_f64(1.05);
+    __mT v_1       = svdup_f64(1.0);
+    __mT v_half    = svdup_f64(0.5);
+    __mT v_m2      = svdup_f64(-2.0);
+    __mT coe_max   = svdup_f64(1e19);
+
+    // sigr = COEF * sqrt(v_fac_a);
+    __mT sigr = svmul_f64_z(pg,COEF_TELE,svsqrt_f64_z(pg,v_fac_a));
+    // sigt = COEF * sqrt(v_fac_b);
+    __mT sigt = svmul_f64_z(pg,COEF_TELE,svsqrt_f64_z(pg,v_fac_b));
+    // sigp = COEF * sqrt(v_fac_c);
+    __mT sigp = svmul_f64_z(pg,COEF_TELE,svsqrt_f64_z(pg,v_fac_c));
+
+    // coe = 1.0 / (sigr/D + sigt/D + sigz/D);
+    __mT coe = svdiv_f64_z(pg, v_1, svadd_f64_z(pg, svadd_f64_z(pg, svmul_f64_z(pg, sigr, DR_inv), svmul_f64_z(pg, sigt, DT_inv)), svmul_f64_z(pg, sigp, DP_inv)));
+    // coe becomes inf as sig* goes to 0
+    // if coe > 1e19, set coe = 1e19
+    coe = svmin_f64_z(pg, coe, coe_max);
+
+    // Htau  = v_fac_a * square((v_pr1 + v_pr2)*0.5);
+    __mT Htau = svmul_f64_z(pg, v_fac_a, my_square_v(pg, svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pr1, v_pr2))));
+    // Htau += v_fac_b * square((v_pt1 + v_pt2)*0.5))
+    Htau = svadd_f64_z(pg, Htau, svmul_f64_z(pg, v_fac_b, my_square_v(pg, svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pt1, v_pt2)))));
+    // Htau += v_fac_c * square((v_pp1 + v_pp2)*0.5))
+    Htau = svadd_f64_z(pg, Htau, svmul_f64_z(pg, v_fac_c, my_square_v(pg, svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pp1, v_pp2)))));
+
+    // tmp1 = (v_pt1 + v_pt2)*0.5
+    __mT tmp1 = svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pt1, v_pt2));
+    // tmp2 = (v_pp1 + v_pp2)*0.5
+    __mT tmp2 = svmul_f64_z(pg, v_half, svadd_f64_z(pg, v_pp1, v_pp2));
+    // tmp3 = -2.0 * v_fac_f * tmp1 * tmp2;
+    __mT tmp3 = svmul_f64_z(pg, v_m2, svmul_f64_z(pg, v_fac_f, svmul_f64_z(pg, tmp1, tmp2)));
+
+    // Htau = sqrt(Htau + tmp3); // # becamse nan as Htau - tmp3 goes to 0
+    Htau = svsqrt_f64_z(pg, svadd_f64_z(pg, Htau, tmp3));
+
+    // tmp = (sigr*(v_pr2 - v_pr1) + sigt*(v_pt2 - v_pt1) + sigz*(v_pp2 - v_pp1))*0.5
+    __mT tmp = svmul_f64_z(pg, v_half, svadd_f64_z(pg, \
+                                                   svadd_f64_z(pg, \
+                                                               svmul_f64_z(pg, sigr, svsub_f64_z(pg, v_pr2, v_pr1)), \
+                                                               svmul_f64_z(pg, sigt, svsub_f64_z(pg, v_pt2, v_pt1))\
+                                                               ), \
+                                                   svmul_f64_z(pg, sigp, svsub_f64_z(pg, v_pp2, v_pp1))\
+                                                   )\
+                            );
+
+    // v_tau += coe * ((v_fun - Htau) + tmp) if mask is true
+    v_tau = svadd_f64_z(pg, v_tau, svmul_f64_z(pg, coe, svadd_f64_z(pg, svsub_f64_z(pg, v_fun, Htau), tmp)));
+    // mask = v_change != 1.0
+    svbool_t mask = svcmpne_f64(pg, v_change, v_1);
+    // v_tau = v_1 if mask is true (!= 1.0)
+    v_tau = svsel_f64(mask, v_1, v_tau);
+
+#endif
+
+}
+
+
+// function for calculating the values on boundaries
+inline void calculate_boundary_nodes_tele_simd(
+#if USE_ARM_SVE
+    svbool_t const& pg,
+#endif
+    __mT const& v_iip, __mT const& v_jjt, __mT const& v_kkr,
+    __mT & v_c__,
+    __mT const& v_p__,     __mT const& v_m__,     __mT const& v__p_,    __mT const& v__m_,    __mT const& v___p,    __mT const& v___m,
+    __mT const& v_pp____,  __mT const& v_mm____,  __mT const& v___pp__, __mT const& v___mm__, __mT const& v_____pp, __mT const& v_____mm,
+    __mT const& v_change,
+    int const& loc_I, int const& loc_J, int const& loc_K
+){
+
+#if USE_AVX512
+
+    // mask for bottom boundary (v_kkr == 0 && v_change == 1.0)
+    __mmaskT mask_bot = _kand_maskT(\
+        _mm512_cmp_pT_mask(v_kkr, v_0, _CMP_EQ_OQ), \
+        _mm512_cmp_pT_mask(v_change, v_1, _CMP_EQ_OQ));
+    // mask for top boundary (v_kkr == loc_K-1 && v_change == 1.0)
+    __mmaskT mask_top = _kand_maskT(\
+        _mm512_cmp_pT_mask(v_kkr, _mmT_set1_pT(loc_K-1), _CMP_EQ_OQ), \
+        _mm512_cmp_pT_mask(v_change, v_1, _CMP_EQ_OQ));
+    // mask for south boundary (v_jjt == 0 && v_change == 1.0)
+    __mmaskT mask_south = _kand_maskT(\
+        _mm512_cmp_pT_mask(v_jjt, v_0, _CMP_EQ_OQ), \
+        _mm512_cmp_pT_mask(v_change, v_1, _CMP_EQ_OQ));
+    // mask for north boundary (v_jjt == loc_J-1 && v_change == 1.0)
+    __mmaskT mask_north = _kand_maskT(\
+        _mm512_cmp_pT_mask(v_jjt, _mmT_set1_pT(loc_J-1), _CMP_EQ_OQ), \
+        _mm512_cmp_pT_mask(v_change, v_1, _CMP_EQ_OQ));
+    // mask for west boundary (v_iip == 0 && v_change == 1.0)
+    __mmaskT mask_west = _kand_maskT(\
+        _mm512_cmp_pT_mask(v_iip, v_0, _CMP_EQ_OQ), \
+        _mm512_cmp_pT_mask(v_change, v_1, _CMP_EQ_OQ));
+    // mask for east boundary (v_iip == loc_I-1 && v_change == 1.0)
+    __mmaskT mask_east = _kand_maskT(\
+        _mm512_cmp_pT_mask(v_iip, _mmT_set1_pT(loc_I-1), _CMP_EQ_OQ), \
+        _mm512_cmp_pT_mask(v_change, v_1, _CMP_EQ_OQ));
+
+    // if mask_bot, v_c__ = max(2*v___p - v_____pp, v_____pp)
+    v_c__ = _mm512_mask_blend_pT(mask_bot, v_c__, _mmT_max_pT(_mmT_sub_pT(_mmT_mul_pT(v_2, v___p), v_____pp), v_____pp));
+    // if mask_top, v_c__ = max(2*v___m - v_____mm, v_____mm)
+    v_c__ = _mm512_mask_blend_pT(mask_top, v_c__, _mmT_max_pT(_mmT_sub_pT(_mmT_mul_pT(v_2, v___m), v_____mm), v_____mm));
+    // if mask_south, v_c__ = max(2*v__p_ - v___pp__, v___pp__)
+    v_c__ = _mm512_mask_blend_pT(mask_south, v_c__, _mmT_max_pT(_mmT_sub_pT(_mmT_mul_pT(v_2, v__p_), v___pp__), v___pp__));
+    // if mask_north, v_c__ = max(2*v__m_ - v___mm__, v___mm__)
+    v_c__ = _mm512_mask_blend_pT(mask_north, v_c__, _mmT_max_pT(_mmT_sub_pT(_mmT_mul_pT(v_2, v__m_), v___mm__), v___mm__));
+    // if mask_west, v_c__ = max(2*v_p__ - v_pp____, v_pp____)
+    v_c__ = _mm512_mask_blend_pT(mask_west, v_c__, _mmT_max_pT(_mmT_sub_pT(_mmT_mul_pT(v_2, v_p__), v_pp____), v_pp____));
+    // if mask_east, v_c__ = max(2*v_m__ - v_mm____, v_mm____)
+    v_c__ = _mm512_mask_blend_pT(mask_east, v_c__, _mmT_max_pT(_mmT_sub_pT(_mmT_mul_pT(v_2, v_m__), v_mm____), v_mm____));
+#elif USE_AVX
+
+    // mask for bottom boundary (v_kkr == 0 && v_change == 1.0)
+    __m256d mask_bot = _mm256_and_pd(\
+        _mm256_cmp_pT(v_kkr, v_0, _CMP_EQ_OQ), \
+        _mm256_cmp_pT(v_change, v_1, _CMP_EQ_OQ));
+    // mask for top boundary (v_kkr == loc_K-1 && v_change == 1.0)
+    __m256d mask_top = _mm256_and_pd(\
+        _mm256_cmp_pT(v_kkr, _mmT_set1_pT(loc_K-1), _CMP_EQ_OQ), \
+        _mm256_cmp_pT(v_change, v_1, _CMP_EQ_OQ));
+    // mask for south boundary (v_jjt == 0 && v_change == 1.0)
+    __m256d mask_south = _mm256_and_pd(\
+        _mm256_cmp_pT(v_jjt, v_0, _CMP_EQ_OQ), \
+        _mm256_cmp_pT(v_change, v_1, _CMP_EQ_OQ));
+    // mask for north boundary (v_jjt == loc_J-1 && v_change == 1.0)
+    __m256d mask_north = _mm256_and_pd(\
+        _mm256_cmp_pT(v_jjt, _mmT_set1_pT(loc_J-1), _CMP_EQ_OQ), \
+        _mm256_cmp_pT(v_change, v_1, _CMP_EQ_OQ));
+    // mask for west boundary (v_iip == 0 && v_change == 1.0)
+    __m256d mask_west = _mm256_and_pd(\
+        _mm256_cmp_pT(v_iip, v_0, _CMP_EQ_OQ), \
+        _mm256_cmp_pT(v_change, v_1, _CMP_EQ_OQ));
+    // mask for east boundary (v_iip == loc_I-1 && v_change == 1.0)
+    __m256d mask_east = _mm256_and_pd(\
+        _mm256_cmp_pT(v_iip, _mmT_set1_pT(loc_I-1), _CMP_EQ_OQ), \
+        _mm256_cmp_pT(v_change, v_1, _CMP_EQ_OQ));
+    // if mask_bot, v_c__ = max(2*v___p - v_____pp, v_____pp)
+    v_c__ = _mm256_blendv_pd(v_c__, _mm256_max_pd(_mm256_sub_pd(_mm256_mul_pd(v_2, v___p), v_____pp), v_____pp), mask_bot);
+    // if mask_top, v_c__ = max(2*v___m - v_____mm, v_____mm)
+    v_c__ = _mm256_blendv_pd(v_c__, _mm256_max_pd(_mm256_sub_pd(_mm256_mul_pd(v_2, v___m), v_____mm), v_____mm), mask_top);
+    // if mask_south, v_c__ = max(2*v__p_ - v___pp__, v___pp__)
+    v_c__ = _mm256_blendv_pd(v_c__, _mm256_max_pd(_mm256_sub_pd(_mm256_mul_pd(v_2, v__p_), v___pp__), v___pp__), mask_south);
+    // if mask_north, v_c__ = max(2*v__m_ - v___mm__, v___mm__)
+    v_c__ = _mm256_blendv_pd(v_c__, _mm256_max_pd(_mm256_sub_pd(_mm256_mul_pd(v_2, v__m_), v___mm__), v___mm__), mask_north);
+    // if mask_west, v_c__ = max(2*v_p__ - v_pp____, v_pp____)
+    v_c__ = _mm256_blendv_pd(v_c__, _mm256_max_pd(_mm256_sub_pd(_mm256_mul_pd(v_2, v_p__), v_pp____), v_pp____), mask_west);
+    // if mask_east, v_c__ = max(2*v_m__ - v_mm____, v_mm____)
+    v_c__ = _mm256_blendv_pd(v_c__, _mm256_max_pd(_mm256_sub_pd(_mm256_mul_pd(v_2, v_m__), v_mm____), v_mm____), mask_east);
+
+#elif USE_ARM_SVE
+
+    svfloat64_t v_0 = svdup_f64(0.0);
+    svfloat64_t v_1 = svdup_f64(1.0);
+    svfloat64_t v_2 = svdup_f64(2.0);
+    svfloat64_t v_loc_I_minus_1 = svdup_f64(loc_I-1);
+    svfloat64_t v_loc_J_minus_1 = svdup_f64(loc_J-1);
+    svfloat64_t v_loc_K_minus_1 = svdup_f64(loc_K-1);
+
+
+    // mask for bottom boundary
+    svbool_t mask_bot = svand_b_z(pg, \
+        svcmpeq_f64(pg, v_kkr, v_0), \
+        svcmpeq_f64(pg, v_change, v_1));
+    // mask for top boundary
+    svbool_t mask_top = svand_b_z(pg, \
+        svcmpeq_f64(pg, v_kkr, v_loc_K_minus_1), \
+        svcmpeq_f64(pg, v_change, v_1));
+    // mask for south boundary
+    svbool_t mask_south = svand_b_z(pg, \
+        svcmpeq_f64(pg, v_jjt, v_0), \
+        svcmpeq_f64(pg, v_change, v_1));
+    // mask for north boundary
+    svbool_t mask_north = svand_b_z(pg, \
+        svcmpeq_f64(pg, v_jjt, v_loc_J_minus_1), \
+        svcmpeq_f64(pg, v_change, v_1));
+    // mask for west boundary
+    svbool_t mask_west = svand_b_z(pg, \
+        svcmpeq_f64(pg, v_iip, v_0), \
+        svcmpeq_f64(pg, v_change, v_1));
+    // mask for east boundary
+    svbool_t mask_east = svand_b_z(pg, \
+        svcmpeq_f64(pg, v_iip, v_loc_I_minus_1), \
+        svcmpeq_f64(pg, v_change, v_1));
+
+    // if mask_bot, v_c__ = max(2*v___p - v_____pp, v_____pp)
+    v_c__ = svsel_f64(mask_bot, \
+        svmax_f64_z(pg, svsub_f64_z(pg, svmul_f64_z(pg, v_2, v___p), v_____pp), v_____pp), \
+        v_c__);
+    // if mask_top, v_c__ = max(2*v___m - v_____mm, v_____mm)
+    v_c__ = svsel_f64(mask_top, \
+        svmax_f64_z(pg, svsub_f64_z(pg, svmul_f64_z(pg, v_2, v___m), v_____mm), v_____mm), \
+        v_c__);
+    // if mask_south, v_c__ = max(2*v__p_ - v___pp__, v___pp__)
+    v_c__ = svsel_f64(mask_south, \
+        svmax_f64_z(pg, svsub_f64_z(pg, svmul_f64_z(pg, v_2, v__p_), v___pp__), v___pp__), \
+        v_c__);
+    // if mask_north, v_c__ = max(2*v__m_ - v___mm__, v___mm__)
+    v_c__ = svsel_f64(mask_north, \
+        svmax_f64_z(pg, svsub_f64_z(pg, svmul_f64_z(pg, v_2, v__m_), v___mm__), v___mm__), \
+        v_c__);
+    // if mask_west, v_c__ = max(2*v_p__ - v_pp____, v_pp____)
+    v_c__ = svsel_f64(mask_west, \
+        svmax_f64_z(pg, svsub_f64_z(pg, svmul_f64_z(pg, v_2, v_p__), v_pp____), v_pp____), \
+        v_c__);
+    // if mask_east, v_c__ = max(2*v_m__ - v_mm____, v_mm____)
+    v_c__ = svsel_f64(mask_east, \
+        svmax_f64_z(pg, svsub_f64_z(pg, svmul_f64_z(pg, v_2, v_m__), v_mm____), v_mm____), \
+        v_c__);
+
+#endif
+
+}
+
+
+
+
+
+
+#if USE_AVX512 || USE_AVX
+
+inline __mT load_mem_gen_to_mTd(CUSTOMREAL* a, int* ijk){
+
+        CUSTOMREAL dump_[NSIMD];
+        for (int i=0; i<NSIMD; i++){
+            dump_[i] = a[ijk[i]];
+        }
+
+        return  _mmT_loadu_pT(dump_);
+}
+
+inline __mT load_mem_bool_to_mTd(bool* a, int* ijk){
+
+        CUSTOMREAL dump_[NSIMD];
+        for (int i=0; i<NSIMD; i++){
+            dump_[i] = (CUSTOMREAL) a[ijk[i]];
+        }
+
+       return _mmT_loadu_pT(dump_);
+}
+
+#elif USE_ARM_SVE
+
+inline __mT load_mem_gen_to_mTd(svbool_t const& pg, CUSTOMREAL* a, uint64_t* ijk){
+    svuint64_t v_ijk = svld1_u64(pg, ijk);
+    return svld1_gather_u64index_f64(pg, a, v_ijk);
+}
+
+inline __mT load_mem_bool_to_mTd(svbool_t const& pg, bool* a, int* ijk){
+        CUSTOMREAL dump_[NSIMD];
+
+        for (int i=0; i<NSIMD; i++){
+            dump_[i] = (CUSTOMREAL) a[ijk[i]];
+        }
+
+        return svld1_f64(pg, dump_); // change this to gather load
+}
+
+#endif // __ARM_FEATURE_SVE
+
+#endif // USE_SIMD
+
+
+
+#endif // VECTORIZATED_SWEEP_H
--- a/include/version.h.in
+++ b/include/version.h.in
@@ -0,0 +1,10 @@
+#ifndef VERSION_H
+#define VERSION_H
+
+#define PROJECT_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
+#define PROJECT_VERSION_MINOR @PROJECT_VERSION_MINOR@
+#define PROJECT_VERSION_PATCH @PROJECT_VERSION_PATCH@
+
+#define GIT_COMMIT "@GIT_COMMIT_HASH@"
+
+#endif // VERSION_H