include/model_update.h

#ifndef MODEL_UPDATE_H
#define MODEL_UPDATE_H

#include <cmath>
#include "config.h"
#include "grid.h"
#include "smooth.h"
#include "smooth_descent_dir.h"
#include "smooth_grad_regul.h"
#include "lbfgs.h"


// generate smoothed kernels (K*_update_loc) from the kernels (K*_loc)
// before doing this, K*_loc should be summed up among all simultaneous runs (by calling sumup_kernels)
// before doing this, K*_update_loc has no meaning (unavailable)
void smooth_kernels(Grid& grid, InputParams& IP) {

    if (subdom_main){ // parallel level 3

        if (id_sim==0){ // parallel level 1
            // initiaize update params
            for (int k = 0; k < loc_K; k++) {
                for (int j = 0; j < loc_J; j++) {
                    for (int i = 0; i < loc_I; i++) {
                        grid.Ks_update_loc_previous[I2V(i,j,k)]   = grid.Ks_update_loc[I2V(i,j,k)];
                        grid.Keta_update_loc_previous[I2V(i,j,k)] = grid.Keta_update_loc[I2V(i,j,k)];
                        grid.Kxi_update_loc_previous[I2V(i,j,k)]  = grid.Kxi_update_loc[I2V(i,j,k)];
                        grid.Ks_update_loc[I2V(i,j,k)]   = _0_CR;
                        grid.Keta_update_loc[I2V(i,j,k)] = _0_CR;
                        grid.Kxi_update_loc[I2V(i,j,k)]  = _0_CR;
                        // grid.Ks_density_update_loc[I2V(i,j,k)]  = _0_CR;
                        // grid.Kxi_density_update_loc[I2V(i,j,k)] = _0_CR;
                        // grid.Keta_density_update_loc[I2V(i,j,k)] = _0_CR;
                    }
                }
            }

            // check kernel
            CUSTOMREAL max_kernel = _0_CR;
            for (int k = 0; k < loc_K; k++) {
                for (int j = 0; j < loc_J; j++) {
                    for (int i = 0; i < loc_I; i++) {
                        max_kernel = std::max(max_kernel, std::abs(grid.Ks_loc[I2V(i,j,k)]));
                    }
                }
            }
            CUSTOMREAL tmp = 0;
            allreduce_cr_single_max(max_kernel, tmp);
            max_kernel = tmp;

            if (max_kernel <= eps) {    
                std::cout << "Error: max_kernel is near zero (less than 10^-12), check data residual and whether no data is used" << std::endl;
                exit(1);
            }

            if (smooth_method == 0) {
                // Ks_loc, Keta_loc, Kxi_loc to be 0 for ghost layers to eliminate the effect of overlapping boundary
                for (int k = 0; k < loc_K; k++) {
                    for (int j = 0; j < loc_J; j++) {
                        for (int i = 0; i < loc_I; i++) {

                            if (i == 0 && !grid.i_first()) {
                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
                            }
                            if (i == loc_I-1 && !grid.i_last()) {
                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
                            }
                            if (j == 0 && !grid.j_first()) {
                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
                            }
                            if (j == loc_J-1 && !grid.j_last()) {
                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
                            }
                            if (k == 0 && !grid.k_first()) {
                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
                            }
                            if (k == loc_K-1 && !grid.k_last()) {
                                grid.Ks_loc[I2V(i,j,k)]         = _0_CR;
                                grid.Keta_loc[I2V(i,j,k)]       = _0_CR;
                                grid.Kxi_loc[I2V(i,j,k)]        = _0_CR;
                                grid.Ks_density_loc[I2V(i,j,k)]     = _0_CR;
                                grid.Kxi_density_loc[I2V(i,j,k)]    = _0_CR;
                                grid.Keta_density_loc[I2V(i,j,k)]   = _0_CR;
                            }
                        }
                    }
                }

                // grid based smoothing
                smooth_inv_kernels_orig(grid, IP);
            } else if (smooth_method == 1) {
                // CG smoothing
                smooth_inv_kernels_CG(grid, smooth_lr, smooth_lt, smooth_lp);
            }

            // shared values on the boundary
            grid.send_recev_boundary_data(grid.Ks_update_loc);
            grid.send_recev_boundary_data(grid.Keta_update_loc);
            grid.send_recev_boundary_data(grid.Kxi_update_loc);
            grid.send_recev_boundary_data(grid.Ks_density_update_loc);
            grid.send_recev_boundary_data(grid.Kxi_density_update_loc);
            grid.send_recev_boundary_data(grid.Keta_density_update_loc);

            grid.send_recev_boundary_data_kosumi(grid.Ks_update_loc);
            grid.send_recev_boundary_data_kosumi(grid.Keta_update_loc);
            grid.send_recev_boundary_data_kosumi(grid.Kxi_update_loc);
            grid.send_recev_boundary_data_kosumi(grid.Ks_density_update_loc);
            grid.send_recev_boundary_data_kosumi(grid.Kxi_density_update_loc);
            grid.send_recev_boundary_data_kosumi(grid.Keta_density_update_loc);

        } // end if id_sim == 0

        // send the updated model to all the simultaneous run
        broadcast_cr_inter_sim(grid.Ks_update_loc, loc_I*loc_J*loc_K, 0);
        broadcast_cr_inter_sim(grid.Kxi_update_loc, loc_I*loc_J*loc_K, 0);
        broadcast_cr_inter_sim(grid.Keta_update_loc, loc_I*loc_J*loc_K, 0);
        broadcast_cr_inter_sim(grid.Ks_density_update_loc, loc_I*loc_J*loc_K, 0);
        broadcast_cr_inter_sim(grid.Kxi_density_update_loc, loc_I*loc_J*loc_K, 0);
        broadcast_cr_inter_sim(grid.Keta_density_update_loc, loc_I*loc_J*loc_K, 0);

        // send the previous updated model to all the simultaneous run
        broadcast_cr_inter_sim(grid.Ks_update_loc_previous, loc_I*loc_J*loc_K, 0);
        broadcast_cr_inter_sim(grid.Kxi_update_loc_previous, loc_I*loc_J*loc_K, 0);
        broadcast_cr_inter_sim(grid.Keta_update_loc_previous, loc_I*loc_J*loc_K, 0);

    } // end if subdom_main

    synchronize_all_world();
}


// smooth gradient regularization term
void smooth_gradient_regularization(Grid& grid) {

    if (subdom_main && id_sim==0){ // only id_sim==0 has values for these arrays
        if (smooth_method == 0) {
            // grid based smoothing
            smooth_gradient_regularization_orig(grid);
        } else if (smooth_method == 1) {
            // CG smoothing
            smooth_gradient_regularization_CG(grid, smooth_lr, smooth_lt, smooth_lp);
        }

        grid.send_recev_boundary_data(grid.fun_gradient_regularization_penalty_loc);
        grid.send_recev_boundary_data(grid.eta_gradient_regularization_penalty_loc);
        grid.send_recev_boundary_data(grid.xi_gradient_regularization_penalty_loc);
    }
}


void smooth_descent_direction(Grid& grid){
    if (subdom_main){
       if (id_sim == 0) { // calculation of the update model is only done in the main simultaneous run
            if (smooth_method == 0) {
                // grid based smoothing
                smooth_descent_dir(grid);
            } else if (smooth_method == 1) {
                // CG smoothing
                smooth_descent_dir_CG(grid, smooth_lr, smooth_lt, smooth_lp);
            }

            // shared values on the boundary
            grid.send_recev_boundary_data(  grid.Ks_descent_dir_loc);
            grid.send_recev_boundary_data(grid.Keta_descent_dir_loc);
            grid.send_recev_boundary_data( grid.Kxi_descent_dir_loc);

        } // end if id_sim == 0

        // send the updated model to all the simultaneous run
        broadcast_cr_inter_sim(  grid.Ks_descent_dir_loc, loc_I*loc_J*loc_K, 0);
        broadcast_cr_inter_sim( grid.Kxi_descent_dir_loc, loc_I*loc_J*loc_K, 0);
        broadcast_cr_inter_sim(grid.Keta_descent_dir_loc, loc_I*loc_J*loc_K, 0);

    }
}


void calc_descent_direction(Grid& grid, int i_inv, InputParams& IP) {

    if (subdom_main) {

        // routines for LBFGS
        if (optim_method == LBFGS_MODE) {

            // calculate the descent direction
            if (i_inv > 0) {
                calculate_descent_direction_lbfgs(grid, i_inv);
            // use gradient for the first iteration
            } else {
                int n_grid = loc_I*loc_J*loc_K;

                // first time, descent direction = - precond * gradient
                // inverse the gradient to fit the update scheme for LBFGS
                for (int i = 0; i < n_grid; i++){
                    grid.Ks_descent_dir_loc[i]   = - _1_CR* grid.Ks_update_loc[i];
                    grid.Keta_descent_dir_loc[i] = - _1_CR* grid.Keta_update_loc[i];
                    grid.Kxi_descent_dir_loc[i]  = - _1_CR* grid.Kxi_update_loc[i];
                    //grid.Ks_descent_dir_loc[i]   = - _1_CR* grid.Ks_loc[i];
                    //grid.Keta_descent_dir_loc[i] = - _1_CR* grid.Keta_loc[i];
                    //grid.Kxi_descent_dir_loc[i]  = - _1_CR* grid.Kxi_loc[i];

                }
            }
        } else {
            // return error
            std::cout << "Error: optim_method is not set to LBFGS_MODE (=2)" << std::endl;
            exit(1);
        }


    } // end if subdom_main

    synchronize_all_world();

}

CUSTOMREAL direction_change_of_model_update(Grid& grid){
    CUSTOMREAL norm_grad = _0_CR;
    CUSTOMREAL norm_grad_previous = _0_CR;
    CUSTOMREAL inner_product = _0_CR;
    CUSTOMREAL cos_angle = _0_CR;
    CUSTOMREAL angle = _0_CR;
    if (subdom_main) {
        // initiaize update params
        inner_product      = dot_product(grid.Ks_update_loc_previous, grid.Ks_update_loc, loc_I*loc_J*loc_K);
        norm_grad          = dot_product(grid.Ks_update_loc, grid.Ks_update_loc, loc_I*loc_J*loc_K);
        norm_grad_previous = dot_product(grid.Ks_update_loc_previous, grid.Ks_update_loc_previous, loc_I*loc_J*loc_K);

        CUSTOMREAL tmp;
        allreduce_cr_single(inner_product,tmp);
        inner_product = tmp;

        allreduce_cr_single(norm_grad,tmp);
        norm_grad = tmp;

        allreduce_cr_single(norm_grad_previous,tmp);
        norm_grad_previous = tmp;

        cos_angle = inner_product / (std::sqrt(norm_grad) * std::sqrt(norm_grad_previous));
        angle     = acos(cos_angle) * RAD2DEG;
    }

    return angle;
}


void set_new_model(Grid& grid, CUSTOMREAL step_length_new, bool init_bfgs=false) {

    if (subdom_main) {

        // for LBFGS mode. K*_update_loc is not directly the descent direction but smoothed gradient
        // so for non LBFGS_mode, nextstep will be calculated with K*_update_loc
        if (optim_method != LBFGS_MODE) {
            // // get the scaling factor
            // CUSTOMREAL Linf_Ks = _0_CR, Linf_Keta = _0_CR, Linf_Kxi = _0_CR;
            // for (int k = 1; k < loc_K-1; k++) {
            //     for (int j = 1; j < loc_J-1; j++) {
            //         for (int i = 1; i < loc_I-1; i++) {
            //             Linf_Ks   = std::max(Linf_Ks,   std::abs(grid.Ks_update_loc[I2V(i,j,k)]));
            //             Linf_Keta = std::max(Linf_Keta, std::abs(grid.Keta_update_loc[I2V(i,j,k)]));
            //             Linf_Kxi  = std::max(Linf_Kxi,  std::abs(grid.Kxi_update_loc[I2V(i,j,k)]));
            //         }
            //     }
            // }

            // // get the maximum scaling factor among subdomains
            // CUSTOMREAL Linf_tmp;
            // allreduce_cr_single_max(Linf_Ks, Linf_tmp);   Linf_Ks = Linf_tmp;
            // allreduce_cr_single_max(Linf_Keta, Linf_tmp); Linf_Keta = Linf_tmp;
            // allreduce_cr_single_max(Linf_Kxi, Linf_tmp);  Linf_Kxi = Linf_tmp;

            // CUSTOMREAL Linf_all = _0_CR;
            // Linf_all = std::max(Linf_Ks, std::max(Linf_Keta, Linf_Kxi));

            // // if (myrank == 0 && id_sim == 0)
            // //    std::cout << "Scaling factor for all kernels: " << Linf_all << std::endl;
            // //    std::cout << "Scaling factor for model update for Ks, Keta, Kx, stepsize: " << Linf_Ks << ", " << Linf_Keta << ", " << Linf_Kxi << ", " << step_length_new << std::endl;


            // Linf_Ks = Linf_all;
            // Linf_Keta = Linf_all;
            // Linf_Kxi = Linf_all;

            // kernel update has been rescaled in "smooth_inv_kernels_orig"

            // update the model
            for (int k = 0; k < loc_K; k++) {
                for (int j = 0; j < loc_J; j++) {
                    for (int i = 0; i < loc_I; i++) {

                        // // update
                        // grid.fun_loc[I2V(i,j,k)] *= (_1_CR - grid.Ks_update_loc[I2V(i,j,k)  ] / (Linf_Ks   /step_length_new) );
                        // grid.xi_loc[I2V(i,j,k)]  -=          grid.Kxi_update_loc[I2V(i,j,k) ] / (Linf_Kxi  /step_length_new)  ;
                        // grid.eta_loc[I2V(i,j,k)] -=          grid.Keta_update_loc[I2V(i,j,k)] / (Linf_Keta /step_length_new)  ;

                        // update
                        grid.fun_loc[I2V(i,j,k)] *= (_1_CR - grid.Ks_update_loc[I2V(i,j,k)  ] * step_length_new);
                        grid.xi_loc[I2V(i,j,k)]  -=          grid.Kxi_update_loc[I2V(i,j,k) ] * step_length_new;
                        grid.eta_loc[I2V(i,j,k)] -=          grid.Keta_update_loc[I2V(i,j,k)] * step_length_new;


                        // grid.fac_b_loc[I2V(i,j,k)] = _1_CR - _2_CR * grid.xi_loc[I2V(i,j,k)];
                        // grid.fac_c_loc[I2V(i,j,k)] = _1_CR + _2_CR * grid.xi_loc[I2V(i,j,k)];
                        // grid.fac_f_loc[I2V(i,j,k)] =       - _2_CR * grid.eta_loc[I2V(i,j,k)];

                        // grid.fun_loc[I2V(i,j,k)] *= (_1_CR);
                        // grid.xi_loc[I2V(i,j,k)]  -= (_0_CR);
                        // grid.eta_loc[I2V(i,j,k)] -= (_0_CR);

                        // grid.fac_b_loc[I2V(i,j,k)] = _1_CR - _2_CR * grid.xi_loc[I2V(i,j,k)];
                        // grid.fac_c_loc[I2V(i,j,k)] = _1_CR + _2_CR * grid.xi_loc[I2V(i,j,k)];
                        // grid.fac_f_loc[I2V(i,j,k)] =       - _2_CR * grid.eta_loc[I2V(i,j,k)];

                    }
                }
            }

            grid.rejuvenate_abcf();


        } else { // for LBFGS routine

            // here all the simultaneous runs have the same values used in this routine.
            // thus we don't need to if(id_sim==0)

            CUSTOMREAL step_length = step_length_new;
            const CUSTOMREAL factor = - _1_CR;

            // update the model
            for (int k = 0; k < loc_K; k++) {
                for (int j = 0; j < loc_J; j++) {
                    for (int i = 0; i < loc_I; i++) {
                        // update
                        grid.fun_loc[I2V(i,j,k)] *= (_1_CR - factor * grid.Ks_descent_dir_loc[I2V(i,j,k)]   * step_length);
                        grid.xi_loc[I2V(i,j,k)]  -=          factor * grid.Kxi_descent_dir_loc[I2V(i,j,k) ] * step_length;
                        grid.eta_loc[I2V(i,j,k)] -=          factor * grid.Keta_descent_dir_loc[I2V(i,j,k)] * step_length;
                        //grid.fun_loc[I2V(i,j,k)] += grid.Ks_descent_dir_loc[I2V(i,j,k)]  *step_length_new;
                        //grid.xi_loc[I2V(i,j,k)]  += grid.Kxi_descent_dir_loc[I2V(i,j,k)] *step_length_new;
                        //grid.eta_loc[I2V(i,j,k)] += grid.Keta_descent_dir_loc[I2V(i,j,k)]*step_length_new;

                        grid.fac_b_loc[I2V(i,j,k)] = _1_CR - _2_CR * grid.xi_loc[I2V(i,j,k)];
                        grid.fac_c_loc[I2V(i,j,k)] = _1_CR + _2_CR * grid.xi_loc[I2V(i,j,k)];
                        grid.fac_f_loc[I2V(i,j,k)] =       - _2_CR * grid.eta_loc[I2V(i,j,k)];
                    }
                }
            }

        } // end LBFGS model update

        // shared values on the boundary
        grid.send_recev_boundary_data(grid.fun_loc);
        grid.send_recev_boundary_data(grid.xi_loc);
        grid.send_recev_boundary_data(grid.eta_loc);
        grid.send_recev_boundary_data(grid.fac_b_loc);
        grid.send_recev_boundary_data(grid.fac_c_loc);
        grid.send_recev_boundary_data(grid.fac_f_loc);

    } // end if subdom_main
}


// compute p_k * grad(f_k)
CUSTOMREAL compute_q_k(Grid& grid) {

    CUSTOMREAL tmp_qk = _0_CR;

    if (subdom_main && id_sim == 0) {

        // grad * descent_direction
        for (int k = 0; k < loc_K; k++) {
            for (int j = 0; j < loc_J; j++) {
                for (int i = 0; i < loc_I; i++) {
                    // we can add here a precondition (here use *_update_loc for storing descent direction)
                    tmp_qk += grid.Ks_update_loc[I2V(i,j,k)]   * grid.Ks_descent_dir_loc[I2V(i,j,k)]
                            + grid.Keta_update_loc[I2V(i,j,k)] * grid.Keta_descent_dir_loc[I2V(i,j,k)]
                            + grid.Kxi_update_loc[I2V(i,j,k)]  * grid.Kxi_descent_dir_loc[I2V(i,j,k)];
                }
            }
        }

        // add tmp_qk among all subdomain
        allreduce_cr_single(tmp_qk, tmp_qk);
    }

    // share tmp_qk among all simultaneous runs
    if (subdom_main)
        broadcast_cr_single_inter_sim(tmp_qk,0);

    return tmp_qk;

}


// check if the wolfe conditions are satisfied
bool check_wolfe_cond(Grid& grid, \
                    CUSTOMREAL q_0, CUSTOMREAL q_t, \
                    CUSTOMREAL qp_0, CUSTOMREAL qp_t, \
                    CUSTOMREAL& td, CUSTOMREAL& tg, CUSTOMREAL& step_length_sub) {
    /*
    q_0 : initial cost function
    q_t : current cost function
    q_p0 : initial grad * descent_dir
    q_pt : current grad * descent_dir
    td : right step size
    tg : left step size
    step_length_sub : current step size
    */

    bool step_accepted = false;

    // check  if the wolfe conditions are satisfied and update the step_length_sub
    CUSTOMREAL qpt = (q_t - q_0) / step_length_sub;

    if (subdom_main) {
        // good step size
        if (qpt   <= wolfe_c1*qp_0 \
         && wolfe_c2*qp_0 <= qp_t \
         ){
            if (myrank==0)
                std::cout << "Wolfe rules:  step accepted" << std::endl;
            step_accepted = true;
        } else {
            // modify the stepsize
            if (wolfe_c1*qp_0 < qpt) {
                td = step_length_sub;
                if (myrank==0)
                    std::cout << "Wolfe rules:  right step size updated." << std::endl;
            }
            if (qpt <= wolfe_c1*qp_0 && qp_t < wolfe_c2*qp_0) {
                tg = step_length_sub;
                if (myrank==0)
                    std::cout << "Wolfe rules:  left step size updated." << std::endl;
            }
            //if (isZero(td)) {
            if (td == _0_CR) {
                step_length_sub = _2_CR * step_length_sub;
                if (myrank==0)
                    std::cout << "Wolfe rules:  step size too small. Increased to " << step_length_sub << std::endl;
            } else {
                step_length_sub = _0_5_CR * (td+tg);
                if (myrank==0)
                    std::cout << "Wolfe rules:  step size too large. Decreased to " << step_length_sub << std::endl;
            }
        }

        // share step_accepted among all subdomains
        broadcast_bool_single(step_accepted,0);
        broadcast_cr_single(step_length_sub,0);
    } // end if subdom_main

    broadcast_bool_single_sub(step_accepted,0);
    broadcast_cr_single_sub(step_length_sub,0);

    return step_accepted;
}


#endif // MODEL_UPDATE_H
initial upload 2025-12-17 10:53:43 +08:00			`#ifndef MODEL_UPDATE_H`
			`#define MODEL_UPDATE_H`

			`#include <cmath>`
			`#include "config.h"`
			`#include "grid.h"`
			`#include "smooth.h"`
			`#include "smooth_descent_dir.h"`
			`#include "smooth_grad_regul.h"`
			`#include "lbfgs.h"`


			`// generate smoothed kernels (K_update_loc) from the kernels (K_loc)`
			`// before doing this, K*_loc should be summed up among all simultaneous runs (by calling sumup_kernels)`
			`// before doing this, K*_update_loc has no meaning (unavailable)`
			`void smooth_kernels(Grid& grid, InputParams& IP) {`

			`if (subdom_main){ // parallel level 3`

			`if (id_sim==0){ // parallel level 1`
			`// initiaize update params`
			`for (int k = 0; k < loc_K; k++) {`
			`for (int j = 0; j < loc_J; j++) {`
			`for (int i = 0; i < loc_I; i++) {`
			`grid.Ks_update_loc_previous[I2V(i,j,k)] = grid.Ks_update_loc[I2V(i,j,k)];`
			`grid.Keta_update_loc_previous[I2V(i,j,k)] = grid.Keta_update_loc[I2V(i,j,k)];`
			`grid.Kxi_update_loc_previous[I2V(i,j,k)] = grid.Kxi_update_loc[I2V(i,j,k)];`
			`grid.Ks_update_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_update_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_update_loc[I2V(i,j,k)] = _0_CR;`
			`// grid.Ks_density_update_loc[I2V(i,j,k)] = _0_CR;`
			`// grid.Kxi_density_update_loc[I2V(i,j,k)] = _0_CR;`
			`// grid.Keta_density_update_loc[I2V(i,j,k)] = _0_CR;`
			`}`
			`}`
			`}`

			`// check kernel`
			`CUSTOMREAL max_kernel = _0_CR;`
			`for (int k = 0; k < loc_K; k++) {`
			`for (int j = 0; j < loc_J; j++) {`
			`for (int i = 0; i < loc_I; i++) {`
			`max_kernel = std::max(max_kernel, std::abs(grid.Ks_loc[I2V(i,j,k)]));`
			`}`
			`}`
			`}`
			`CUSTOMREAL tmp = 0;`
			`allreduce_cr_single_max(max_kernel, tmp);`
			`max_kernel = tmp;`

			`if (max_kernel <= eps) {`
			`std::cout << "Error: max_kernel is near zero (less than 10^-12), check data residual and whether no data is used" << std::endl;`
			`exit(1);`
			`}`

			`if (smooth_method == 0) {`
			`// Ks_loc, Keta_loc, Kxi_loc to be 0 for ghost layers to eliminate the effect of overlapping boundary`
			`for (int k = 0; k < loc_K; k++) {`
			`for (int j = 0; j < loc_J; j++) {`
			`for (int i = 0; i < loc_I; i++) {`

			`if (i == 0 && !grid.i_first()) {`
			`grid.Ks_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Ks_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_density_loc[I2V(i,j,k)] = _0_CR;`
			`}`
			`if (i == loc_I-1 && !grid.i_last()) {`
			`grid.Ks_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Ks_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_density_loc[I2V(i,j,k)] = _0_CR;`
			`}`
			`if (j == 0 && !grid.j_first()) {`
			`grid.Ks_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Ks_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_density_loc[I2V(i,j,k)] = _0_CR;`
			`}`
			`if (j == loc_J-1 && !grid.j_last()) {`
			`grid.Ks_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Ks_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_density_loc[I2V(i,j,k)] = _0_CR;`
			`}`
			`if (k == 0 && !grid.k_first()) {`
			`grid.Ks_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Ks_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_density_loc[I2V(i,j,k)] = _0_CR;`
			`}`
			`if (k == loc_K-1 && !grid.k_last()) {`
			`grid.Ks_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Ks_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Kxi_density_loc[I2V(i,j,k)] = _0_CR;`
			`grid.Keta_density_loc[I2V(i,j,k)] = _0_CR;`
			`}`
			`}`
			`}`
			`}`

			`// grid based smoothing`
			`smooth_inv_kernels_orig(grid, IP);`
			`} else if (smooth_method == 1) {`
			`// CG smoothing`
			`smooth_inv_kernels_CG(grid, smooth_lr, smooth_lt, smooth_lp);`
			`}`

			`// shared values on the boundary`
			`grid.send_recev_boundary_data(grid.Ks_update_loc);`
			`grid.send_recev_boundary_data(grid.Keta_update_loc);`
			`grid.send_recev_boundary_data(grid.Kxi_update_loc);`
			`grid.send_recev_boundary_data(grid.Ks_density_update_loc);`
			`grid.send_recev_boundary_data(grid.Kxi_density_update_loc);`
			`grid.send_recev_boundary_data(grid.Keta_density_update_loc);`

			`grid.send_recev_boundary_data_kosumi(grid.Ks_update_loc);`
			`grid.send_recev_boundary_data_kosumi(grid.Keta_update_loc);`
			`grid.send_recev_boundary_data_kosumi(grid.Kxi_update_loc);`
			`grid.send_recev_boundary_data_kosumi(grid.Ks_density_update_loc);`
			`grid.send_recev_boundary_data_kosumi(grid.Kxi_density_update_loc);`
			`grid.send_recev_boundary_data_kosumi(grid.Keta_density_update_loc);`

			`} // end if id_sim == 0`

			`// send the updated model to all the simultaneous run`
			`broadcast_cr_inter_sim(grid.Ks_update_loc, loc_Iloc_Jloc_K, 0);`
			`broadcast_cr_inter_sim(grid.Kxi_update_loc, loc_Iloc_Jloc_K, 0);`
			`broadcast_cr_inter_sim(grid.Keta_update_loc, loc_Iloc_Jloc_K, 0);`
			`broadcast_cr_inter_sim(grid.Ks_density_update_loc, loc_Iloc_Jloc_K, 0);`
			`broadcast_cr_inter_sim(grid.Kxi_density_update_loc, loc_Iloc_Jloc_K, 0);`
			`broadcast_cr_inter_sim(grid.Keta_density_update_loc, loc_Iloc_Jloc_K, 0);`

			`// send the previous updated model to all the simultaneous run`
			`broadcast_cr_inter_sim(grid.Ks_update_loc_previous, loc_Iloc_Jloc_K, 0);`
			`broadcast_cr_inter_sim(grid.Kxi_update_loc_previous, loc_Iloc_Jloc_K, 0);`
			`broadcast_cr_inter_sim(grid.Keta_update_loc_previous, loc_Iloc_Jloc_K, 0);`

			`} // end if subdom_main`

			`synchronize_all_world();`
			`}`


			`// smooth gradient regularization term`
			`void smooth_gradient_regularization(Grid& grid) {`

			`if (subdom_main && id_sim==0){ // only id_sim==0 has values for these arrays`
			`if (smooth_method == 0) {`
			`// grid based smoothing`
			`smooth_gradient_regularization_orig(grid);`
			`} else if (smooth_method == 1) {`
			`// CG smoothing`
			`smooth_gradient_regularization_CG(grid, smooth_lr, smooth_lt, smooth_lp);`
			`}`

			`grid.send_recev_boundary_data(grid.fun_gradient_regularization_penalty_loc);`
			`grid.send_recev_boundary_data(grid.eta_gradient_regularization_penalty_loc);`
			`grid.send_recev_boundary_data(grid.xi_gradient_regularization_penalty_loc);`
			`}`
			`}`


			`void smooth_descent_direction(Grid& grid){`
			`if (subdom_main){`
			`if (id_sim == 0) { // calculation of the update model is only done in the main simultaneous run`
			`if (smooth_method == 0) {`
			`// grid based smoothing`
			`smooth_descent_dir(grid);`
			`} else if (smooth_method == 1) {`
			`// CG smoothing`
			`smooth_descent_dir_CG(grid, smooth_lr, smooth_lt, smooth_lp);`
			`}`

			`// shared values on the boundary`
			`grid.send_recev_boundary_data( grid.Ks_descent_dir_loc);`
			`grid.send_recev_boundary_data(grid.Keta_descent_dir_loc);`
			`grid.send_recev_boundary_data( grid.Kxi_descent_dir_loc);`

			`} // end if id_sim == 0`

			`// send the updated model to all the simultaneous run`
			`broadcast_cr_inter_sim( grid.Ks_descent_dir_loc, loc_Iloc_Jloc_K, 0);`
			`broadcast_cr_inter_sim( grid.Kxi_descent_dir_loc, loc_Iloc_Jloc_K, 0);`
			`broadcast_cr_inter_sim(grid.Keta_descent_dir_loc, loc_Iloc_Jloc_K, 0);`

			`}`
			`}`


			`void calc_descent_direction(Grid& grid, int i_inv, InputParams& IP) {`

			`if (subdom_main) {`

			`// routines for LBFGS`
			`if (optim_method == LBFGS_MODE) {`

			`// calculate the descent direction`
			`if (i_inv > 0) {`
			`calculate_descent_direction_lbfgs(grid, i_inv);`
			`// use gradient for the first iteration`
			`} else {`
			`int n_grid = loc_Iloc_Jloc_K;`

			`// first time, descent direction = - precond * gradient`
			`// inverse the gradient to fit the update scheme for LBFGS`
			`for (int i = 0; i < n_grid; i++){`
			`grid.Ks_descent_dir_loc[i] = - _1_CR* grid.Ks_update_loc[i];`
			`grid.Keta_descent_dir_loc[i] = - _1_CR* grid.Keta_update_loc[i];`
			`grid.Kxi_descent_dir_loc[i] = - _1_CR* grid.Kxi_update_loc[i];`
			`//grid.Ks_descent_dir_loc[i] = - _1_CR* grid.Ks_loc[i];`
			`//grid.Keta_descent_dir_loc[i] = - _1_CR* grid.Keta_loc[i];`
			`//grid.Kxi_descent_dir_loc[i] = - _1_CR* grid.Kxi_loc[i];`

			`}`
			`}`
			`} else {`
			`// return error`
			`std::cout << "Error: optim_method is not set to LBFGS_MODE (=2)" << std::endl;`
			`exit(1);`
			`}`


			`} // end if subdom_main`

			`synchronize_all_world();`

			`}`

			`CUSTOMREAL direction_change_of_model_update(Grid& grid){`
			`CUSTOMREAL norm_grad = _0_CR;`
			`CUSTOMREAL norm_grad_previous = _0_CR;`
			`CUSTOMREAL inner_product = _0_CR;`
			`CUSTOMREAL cos_angle = _0_CR;`
			`CUSTOMREAL angle = _0_CR;`
			`if (subdom_main) {`
			`// initiaize update params`
			`inner_product = dot_product(grid.Ks_update_loc_previous, grid.Ks_update_loc, loc_Iloc_Jloc_K);`
			`norm_grad = dot_product(grid.Ks_update_loc, grid.Ks_update_loc, loc_Iloc_Jloc_K);`
			`norm_grad_previous = dot_product(grid.Ks_update_loc_previous, grid.Ks_update_loc_previous, loc_Iloc_Jloc_K);`

			`CUSTOMREAL tmp;`
			`allreduce_cr_single(inner_product,tmp);`
			`inner_product = tmp;`

			`allreduce_cr_single(norm_grad,tmp);`
			`norm_grad = tmp;`

			`allreduce_cr_single(norm_grad_previous,tmp);`
			`norm_grad_previous = tmp;`

			`cos_angle = inner_product / (std::sqrt(norm_grad) * std::sqrt(norm_grad_previous));`
			`angle = acos(cos_angle) * RAD2DEG;`
			`}`

			`return angle;`
			`}`


			`void set_new_model(Grid& grid, CUSTOMREAL step_length_new, bool init_bfgs=false) {`

			`if (subdom_main) {`

			`// for LBFGS mode. K*_update_loc is not directly the descent direction but smoothed gradient`
			`// so for non LBFGS_mode, nextstep will be calculated with K*_update_loc`
			`if (optim_method != LBFGS_MODE) {`
			`// // get the scaling factor`
			`// CUSTOMREAL Linf_Ks = _0_CR, Linf_Keta = _0_CR, Linf_Kxi = _0_CR;`
			`// for (int k = 1; k < loc_K-1; k++) {`
			`// for (int j = 1; j < loc_J-1; j++) {`
			`// for (int i = 1; i < loc_I-1; i++) {`
			`// Linf_Ks = std::max(Linf_Ks, std::abs(grid.Ks_update_loc[I2V(i,j,k)]));`
			`// Linf_Keta = std::max(Linf_Keta, std::abs(grid.Keta_update_loc[I2V(i,j,k)]));`
			`// Linf_Kxi = std::max(Linf_Kxi, std::abs(grid.Kxi_update_loc[I2V(i,j,k)]));`
			`// }`
			`// }`
			`// }`

			`// // get the maximum scaling factor among subdomains`
			`// CUSTOMREAL Linf_tmp;`
			`// allreduce_cr_single_max(Linf_Ks, Linf_tmp); Linf_Ks = Linf_tmp;`
			`// allreduce_cr_single_max(Linf_Keta, Linf_tmp); Linf_Keta = Linf_tmp;`
			`// allreduce_cr_single_max(Linf_Kxi, Linf_tmp); Linf_Kxi = Linf_tmp;`

			`// CUSTOMREAL Linf_all = _0_CR;`
			`// Linf_all = std::max(Linf_Ks, std::max(Linf_Keta, Linf_Kxi));`

			`// // if (myrank == 0 && id_sim == 0)`
			`// // std::cout << "Scaling factor for all kernels: " << Linf_all << std::endl;`
			`// // std::cout << "Scaling factor for model update for Ks, Keta, Kx, stepsize: " << Linf_Ks << ", " << Linf_Keta << ", " << Linf_Kxi << ", " << step_length_new << std::endl;`


			`// Linf_Ks = Linf_all;`
			`// Linf_Keta = Linf_all;`
			`// Linf_Kxi = Linf_all;`

			`// kernel update has been rescaled in "smooth_inv_kernels_orig"`

			`// update the model`
			`for (int k = 0; k < loc_K; k++) {`
			`for (int j = 0; j < loc_J; j++) {`
			`for (int i = 0; i < loc_I; i++) {`

			`// // update`
			`// grid.fun_loc[I2V(i,j,k)] *= (_1_CR - grid.Ks_update_loc[I2V(i,j,k) ] / (Linf_Ks /step_length_new) );`
			`// grid.xi_loc[I2V(i,j,k)] -= grid.Kxi_update_loc[I2V(i,j,k) ] / (Linf_Kxi /step_length_new) ;`
			`// grid.eta_loc[I2V(i,j,k)] -= grid.Keta_update_loc[I2V(i,j,k)] / (Linf_Keta /step_length_new) ;`

			`// update`
			`grid.fun_loc[I2V(i,j,k)] = (_1_CR - grid.Ks_update_loc[I2V(i,j,k) ] step_length_new);`
			`grid.xi_loc[I2V(i,j,k)] -= grid.Kxi_update_loc[I2V(i,j,k) ] * step_length_new;`
			`grid.eta_loc[I2V(i,j,k)] -= grid.Keta_update_loc[I2V(i,j,k)] * step_length_new;`


			`// grid.fac_b_loc[I2V(i,j,k)] = _1_CR - _2_CR * grid.xi_loc[I2V(i,j,k)];`
			`// grid.fac_c_loc[I2V(i,j,k)] = _1_CR + _2_CR * grid.xi_loc[I2V(i,j,k)];`
			`// grid.fac_f_loc[I2V(i,j,k)] = - _2_CR * grid.eta_loc[I2V(i,j,k)];`

			`// grid.fun_loc[I2V(i,j,k)] *= (_1_CR);`
			`// grid.xi_loc[I2V(i,j,k)] -= (_0_CR);`
			`// grid.eta_loc[I2V(i,j,k)] -= (_0_CR);`

			`// grid.fac_b_loc[I2V(i,j,k)] = _1_CR - _2_CR * grid.xi_loc[I2V(i,j,k)];`
			`// grid.fac_c_loc[I2V(i,j,k)] = _1_CR + _2_CR * grid.xi_loc[I2V(i,j,k)];`
			`// grid.fac_f_loc[I2V(i,j,k)] = - _2_CR * grid.eta_loc[I2V(i,j,k)];`

			`}`
			`}`
			`}`

			`grid.rejuvenate_abcf();`


			`} else { // for LBFGS routine`

			`// here all the simultaneous runs have the same values used in this routine.`
			`// thus we don't need to if(id_sim==0)`

			`CUSTOMREAL step_length = step_length_new;`
			`const CUSTOMREAL factor = - _1_CR;`

			`// update the model`
			`for (int k = 0; k < loc_K; k++) {`
			`for (int j = 0; j < loc_J; j++) {`
			`for (int i = 0; i < loc_I; i++) {`
			`// update`
			`grid.fun_loc[I2V(i,j,k)] = (_1_CR - factor grid.Ks_descent_dir_loc[I2V(i,j,k)] * step_length);`
			`grid.xi_loc[I2V(i,j,k)] -= factor * grid.Kxi_descent_dir_loc[I2V(i,j,k) ] * step_length;`
			`grid.eta_loc[I2V(i,j,k)] -= factor * grid.Keta_descent_dir_loc[I2V(i,j,k)] * step_length;`
			`//grid.fun_loc[I2V(i,j,k)] += grid.Ks_descent_dir_loc[I2V(i,j,k)] *step_length_new;`
			`//grid.xi_loc[I2V(i,j,k)] += grid.Kxi_descent_dir_loc[I2V(i,j,k)] *step_length_new;`
			`//grid.eta_loc[I2V(i,j,k)] += grid.Keta_descent_dir_loc[I2V(i,j,k)]*step_length_new;`

			`grid.fac_b_loc[I2V(i,j,k)] = _1_CR - _2_CR * grid.xi_loc[I2V(i,j,k)];`
			`grid.fac_c_loc[I2V(i,j,k)] = _1_CR + _2_CR * grid.xi_loc[I2V(i,j,k)];`
			`grid.fac_f_loc[I2V(i,j,k)] = - _2_CR * grid.eta_loc[I2V(i,j,k)];`
			`}`
			`}`
			`}`

			`} // end LBFGS model update`

			`// shared values on the boundary`
			`grid.send_recev_boundary_data(grid.fun_loc);`
			`grid.send_recev_boundary_data(grid.xi_loc);`
			`grid.send_recev_boundary_data(grid.eta_loc);`
			`grid.send_recev_boundary_data(grid.fac_b_loc);`
			`grid.send_recev_boundary_data(grid.fac_c_loc);`
			`grid.send_recev_boundary_data(grid.fac_f_loc);`

			`} // end if subdom_main`
			`}`



			`// compute p_k * grad(f_k)`
			`CUSTOMREAL compute_q_k(Grid& grid) {`

			`CUSTOMREAL tmp_qk = _0_CR;`

			`if (subdom_main && id_sim == 0) {`

			`// grad * descent_direction`
			`for (int k = 0; k < loc_K; k++) {`
			`for (int j = 0; j < loc_J; j++) {`
			`for (int i = 0; i < loc_I; i++) {`
			`// we can add here a precondition (here use *_update_loc for storing descent direction)`
			`tmp_qk += grid.Ks_update_loc[I2V(i,j,k)] * grid.Ks_descent_dir_loc[I2V(i,j,k)]`
			`+ grid.Keta_update_loc[I2V(i,j,k)] * grid.Keta_descent_dir_loc[I2V(i,j,k)]`
			`+ grid.Kxi_update_loc[I2V(i,j,k)] * grid.Kxi_descent_dir_loc[I2V(i,j,k)];`
			`}`
			`}`
			`}`

			`// add tmp_qk among all subdomain`
			`allreduce_cr_single(tmp_qk, tmp_qk);`
			`}`

			`// share tmp_qk among all simultaneous runs`
			`if (subdom_main)`
			`broadcast_cr_single_inter_sim(tmp_qk,0);`

			`return tmp_qk;`

			`}`


			`// check if the wolfe conditions are satisfied`
			`bool check_wolfe_cond(Grid& grid, \`
			`CUSTOMREAL q_0, CUSTOMREAL q_t, \`
			`CUSTOMREAL qp_0, CUSTOMREAL qp_t, \`
			`CUSTOMREAL& td, CUSTOMREAL& tg, CUSTOMREAL& step_length_sub) {`
			`/*`
			`q_0 : initial cost function`
			`q_t : current cost function`
			`q_p0 : initial grad * descent_dir`
			`q_pt : current grad * descent_dir`
			`td : right step size`
			`tg : left step size`
			`step_length_sub : current step size`
			`*/`

			`bool step_accepted = false;`

			`// check if the wolfe conditions are satisfied and update the step_length_sub`
			`CUSTOMREAL qpt = (q_t - q_0) / step_length_sub;`

			`if (subdom_main) {`
			`// good step size`
			`if (qpt <= wolfe_c1*qp_0 \`
			`&& wolfe_c2*qp_0 <= qp_t \`
			`){`
			`if (myrank==0)`
			`std::cout << "Wolfe rules: step accepted" << std::endl;`
			`step_accepted = true;`
			`} else {`
			`// modify the stepsize`
			`if (wolfe_c1*qp_0 < qpt) {`
			`td = step_length_sub;`
			`if (myrank==0)`
			`std::cout << "Wolfe rules: right step size updated." << std::endl;`
			`}`
			`if (qpt <= wolfe_c1qp_0 && qp_t < wolfe_c2qp_0) {`
			`tg = step_length_sub;`
			`if (myrank==0)`
			`std::cout << "Wolfe rules: left step size updated." << std::endl;`
			`}`
			`//if (isZero(td)) {`
			`if (td == _0_CR) {`
			`step_length_sub = _2_CR * step_length_sub;`
			`if (myrank==0)`
			`std::cout << "Wolfe rules: step size too small. Increased to " << step_length_sub << std::endl;`
			`} else {`
			`step_length_sub = _0_5_CR * (td+tg);`
			`if (myrank==0)`
			`std::cout << "Wolfe rules: step size too large. Decreased to " << step_length_sub << std::endl;`
			`}`
			`}`

			`// share step_accepted among all subdomains`
			`broadcast_bool_single(step_accepted,0);`
			`broadcast_cr_single(step_length_sub,0);`
			`} // end if subdom_main`

			`broadcast_bool_single_sub(step_accepted,0);`
			`broadcast_cr_single_sub(step_length_sub,0);`

			`return step_accepted;`
			`}`



			`#endif // MODEL_UPDATE_H`