/********************************************************
 *  ██████╗  ██████╗████████╗██╗
 * ██╔════╝ ██╔════╝╚══██╔══╝██║
 * ██║  ███╗██║        ██║   ██║
 * ██║   ██║██║        ██║   ██║
 * ╚██████╔╝╚██████╗   ██║   ███████╗
 *  ╚═════╝  ╚═════╝   ╚═╝   ╚══════╝
 * Geophysical Computational Tools & Library (GCTL)
 *
 * Copyright (c) 2022 Yi Zhang (yizhang-geo@zju.edu.cn)
 *
 * GCTL is distributed under a dual licensing scheme. You can redistribute
 * it and/or modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation, either version 2
 * of the License, or (at your option) any later version. You should have
 * received a copy of the GNU Lesser General Public License along with this
 * program. If not, see <https://www.gnu.org/licenses/>.
 *
 * If the terms and conditions of the LGPL v.2. would prevent you from using
 * the GCTL, please consider the option to obtain a commercial license for a
 * fee. These licenses are offered by the GCTL's original author. As a rule,
 * licenses are provided "as-is", unlimited in time for a one time fee. Please
 * send corresponding requests to: yizhang-geo@zju.edu.cn. Please do not forget
 * to include some description of your company and the realm of its activities.
 * Also add information on how to contact you by electronic and paper mail.
 ******************************************************/

#include "lbfgs.h"
#include <typeinfo> // standard header — was the project-style "typeinfo"

// Comparison helper macros.
#define min2(a, b) ((a) <= (b) ? (a) : (b))
#define max2(a, b) ((a) >= (b) ? (a) : (b))
// NOTE: no trailing semicolon — the original definition ended with ';',
// which injected an empty statement at every expansion site.
#define max3(a, b, c) max2(max2((a), (b)), (c))
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
//迭代数据类型 struct iteration_data_t { double alpha; gctl::array s; /* [n] */ gctl::array y; /* [n] */ double ys; /* vecdot(y, s) y与s的点积 */ }; // 试算测试步长 int update_trial_interval(double *x, double *fx, double *dx, double *y, double *fy, double *dy, double *t, double *ft, double *dt, const double tmin, const double tmax, int *brackt); // 计算x的L1模长 double owlqn_x1norm(const gctl::array &x, const int start, const int n); // 计算似模长 void owlqn_pseudo_gradient(gctl::array &pg, const gctl::array &x, const gctl::array &g, const int n, const double c, const int start, const int end); // 计算模长投影 void owlqn_project(gctl::array &d, const gctl::array &sign, const int start, const int end); // 默认的迭代参数 static const gctl::lbfgs_para lbfgs_defparam = { 6, 1e-5, 0, 1e-5, 1e-8, 0, gctl::LBFGS_LINESEARCH_DEFAULT, 40, 1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16, 0.0, 0, -1, }; gctl::lbfgs_solver::lbfgs_solver() { lbfgs_param_ = lbfgs_defparam; lbfgs_silent_ = false; } gctl::lbfgs_solver::~lbfgs_solver(){} void gctl::lbfgs_solver::lbfgs_silent() { lbfgs_silent_ = true; return; } void gctl::lbfgs_solver::LBFGS_Precondition(const array &x, const array &g, const array &d, array &d_pre) { throw std::runtime_error("The preconditioning function 'gctl::lbfgs_solver::LBFGS_Precondition(const array &x, const array &g, const array &d, array &d_pre)' must be defined before using."); return; } int gctl::lbfgs_solver::LBFGS_Progress(const array &x, const array &g, const double fx, const double converge, const double rate, const lbfgs_para param, int k, int ls, std::ostream &ss) { std::string ss_str = typeid(ss).name(); if (ss_str.find("ofstream") != std::string::npos) { if ((converge <= param.epsilon || fx <= param.residual) && lbfgs_silent_ == false) { ss << "Times: " << k << ", F(x) = " << fx << ", Eps. = " << converge << ", "; if (param.past > 0) ss << "Delta = " << rate << ", "; } } else if (!lbfgs_silent_) { ss << GCTL_CLEARLINE << "\rTimes: " << k << ", F(x) = " << fx << ", Eps. 
= " << converge; if (param.past > 0) ss << ", Delta = " << rate; } return 0; } void gctl::lbfgs_solver::set_lbfgs_para(const lbfgs_para &in_param) { lbfgs_param_ = in_param; return; } void gctl::lbfgs_solver::set_lbfgs_para(const toml::value &toml_data) { lbfgs_param_ = lbfgs_defparam; std::string LBFGS = "lbfgs"; if (toml_data.contains(LBFGS)) { if (toml_data.at(LBFGS).contains("m")) lbfgs_param_.m = toml::find(toml_data, LBFGS, "m"); if (toml_data.at(LBFGS).contains("epsilon")) lbfgs_param_.epsilon = toml::find(toml_data, LBFGS, "epsilon"); if (toml_data.at(LBFGS).contains("past")) lbfgs_param_.past = toml::find(toml_data, LBFGS, "past"); if (toml_data.at(LBFGS).contains("delta")) lbfgs_param_.delta = toml::find(toml_data, LBFGS, "delta"); if (toml_data.at(LBFGS).contains("residual")) lbfgs_param_.residual = toml::find(toml_data, LBFGS, "residual"); if (toml_data.at(LBFGS).contains("max_iterations")) lbfgs_param_.max_iterations = toml::find(toml_data, LBFGS, "max_iterations"); if (toml_data.at(LBFGS).contains("max_linesearch")) lbfgs_param_.max_linesearch = toml::find(toml_data, LBFGS, "max_linesearch"); if (toml_data.at(LBFGS).contains("min_step")) lbfgs_param_.min_step = toml::find(toml_data, LBFGS, "min_step"); if (toml_data.at(LBFGS).contains("max_step")) lbfgs_param_.max_step = toml::find(toml_data, LBFGS, "max_step"); if (toml_data.at(LBFGS).contains("ftol")) lbfgs_param_.ftol = toml::find(toml_data, LBFGS, "ftol"); if (toml_data.at(LBFGS).contains("wolfe")) lbfgs_param_.wolfe = toml::find(toml_data, LBFGS, "wolfe"); if (toml_data.at(LBFGS).contains("gtol")) lbfgs_param_.gtol = toml::find(toml_data, LBFGS, "gtol"); if (toml_data.at(LBFGS).contains("xtol")) lbfgs_param_.xtol = toml::find(toml_data, LBFGS, "xtol"); if (toml_data.at(LBFGS).contains("orthantwise_c")) lbfgs_param_.orthantwise_c = toml::find(toml_data, LBFGS, "orthantwise_c"); if (toml_data.at(LBFGS).contains("orthantwise_start")) lbfgs_param_.orthantwise_start = toml::find(toml_data, LBFGS, 
"orthantwise_start"); if (toml_data.at(LBFGS).contains("orthantwise_end")) lbfgs_param_.orthantwise_end = toml::find(toml_data, LBFGS, "orthantwise_end"); if (toml_data.at(LBFGS).contains("linesearch")) { std::string l_str = toml::find(toml_data, LBFGS, "linesearch"); if (l_str == "LBFGS_LINESEARCH_DEFAULT") lbfgs_param_.linesearch = LBFGS_LINESEARCH_DEFAULT; else if (l_str == "LBFGS_LINESEARCH_MORETHUENTE") lbfgs_param_.linesearch = LBFGS_LINESEARCH_MORETHUENTE; else if (l_str == "LBFGS_LINESEARCH_BACKTRACKING_ARMIJO") lbfgs_param_.linesearch = LBFGS_LINESEARCH_BACKTRACKING_ARMIJO; else if (l_str == "LBFGS_LINESEARCH_BACKTRACKING") lbfgs_param_.linesearch = LBFGS_LINESEARCH_BACKTRACKING; else if (l_str == "LBFGS_LINESEARCH_BACKTRACKING_WOLFE") lbfgs_param_.linesearch = LBFGS_LINESEARCH_BACKTRACKING_WOLFE; else if (l_str == "LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE") lbfgs_param_.linesearch = LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE; else if (l_str == "LBFGS_LINESEARCH_BACKTRACKING_ARMIJO_QUAD") lbfgs_param_.linesearch = LBFGS_LINESEARCH_BACKTRACKING_ARMIJO_QUAD; else throw std::runtime_error("[FMM_INVERSION_2D] Invalid lbfgs' linesearching algorithm."); } } return; } void gctl::lbfgs_solver::lbfgs_error_str(lbfgs_return_code err_code, std::ostream &ss, bool err_throw) { std::string ss_str = typeid(ss).name(); #if defined _WINDOWS || __WIN32__ if (!err_throw) { if (ss_str.find("ofstream") != std::string::npos) { if (err_code >= 0) ss << "L-BFGS Success! "; else ss << "L-BFGS Fail! "; } else { if (er_index >= 0) { SetConsoleTextAttribute(GetStdHandle(STD_ERROR_HANDLE), FOREGROUND_INTENSITY | FOREGROUND_GREEN); ss << "L-BFGS Success! "; } else { SetConsoleTextAttribute(GetStdHandle(STD_ERROR_HANDLE), FOREGROUND_INTENSITY | FOREGROUND_RED); ss << "L-BFGS Fail! "; } } } #else if (!err_throw) { if (ss_str.find("ofstream") != std::string::npos) { if (err_code >= 0) ss << "L-BFGS Success! "; else ss << "L-BFGS Fail! 
"; } else { if (err_code >= 0) ss << "\033[1m\033[32mL-BFGS Success! "; else ss << "\033[1m\033[31mL-BFGS Fail! "; } } #endif std::string err_str; switch(err_code) { case LBFGS_EPS_CONVERGENCE: err_str = "Reached convergence (epsilon)."; break; case LBFGS_DELTA_CONVERGENCE: err_str = "Reached convergence (delta)."; break; case LBFGS_RESI_CONVERGENCE: err_str = "Reached convergence (residual)."; break; case LBFGS_STOP: err_str = "Met stopping criteria."; break; case LBFGS_ALREADY_MINIMIZED: err_str = "The initial variables already minimize the objective function."; break; case LBFGSERR_UNKNOWNERROR: err_str = "Unknown error."; break; case LBFGSERR_LOGICERROR: err_str = "Logic error."; break; case LBFGSERR_OUTOFMEMORY: err_str = "Insufficient memory."; break; case LBFGSERR_CANCELED: err_str = "The minimization process has been canceled."; break; case LBFGSERR_INVALID_N: err_str = "Invalid number of variables specified."; break; case LBFGSERR_INVALID_N_SSE: err_str = "Invalid number of variables (for SSE) specified."; break; case LBFGSERR_INVALID_X_SSE: err_str = "The array x must be aligned to 16 (for SSE)."; break; case LBFGSERR_INVALID_EPSILON: err_str = "Invalid parameter lbfgs_para::epsilon specified."; break; case LBFGSERR_INVALID_TESTPERIOD: err_str = "Invalid parameter lbfgs_para::past specified."; break; case LBFGSERR_INVALID_DELTA: err_str = "Invalid parameter lbfgs_para::delta specified."; break; case LBFGSERR_INVALID_LINESEARCH: err_str = "Invalid parameter lbfgs_para::linesearch specified."; break; case LBFGSERR_INVALID_MINSTEP: err_str = "Invalid parameter lbfgs_para::max_step specified."; break; case LBFGSERR_INVALID_MAXSTEP: err_str = "Invalid parameter lbfgs_para::max_step specified."; break; case LBFGSERR_INVALID_FTOL: err_str = "Invalid parameter lbfgs_para::ftol specified."; break; case LBFGSERR_INVALID_WOLFE: err_str = "Invalid parameter lbfgs_para::wolfe specified."; break; case LBFGSERR_INVALID_GTOL: err_str = "Invalid parameter lbfgs_para::gtol 
specified."; break; case LBFGSERR_INVALID_XTOL: err_str = "Invalid parameter lbfgs_para::xtol specified."; break; case LBFGSERR_INVALID_MAXLINESEARCH: err_str = "Invalid parameter lbfgs_para::max_linesearch specified."; break; case LBFGSERR_INVALID_ORTHANTWISE: err_str = "Invalid parameter lbfgs_para::orthantwise_c specified."; break; case LBFGSERR_INVALID_ORTHANTWISE_START: err_str = "Invalid parameter lbfgs_para::orthantwise_start specified."; break; case LBFGSERR_INVALID_ORTHANTWISE_END: err_str = "Invalid parameter lbfgs_para::orthantwise_end specified."; break; case LBFGSERR_OUTOFINTERVAL: err_str = "The line-search step went out of the interval of uncertainty."; break; case LBFGSERR_INCORRECT_TMINMAX: err_str = "A logic error occurred; alternatively, the interval of uncertainty became too small."; break; case LBFGSERR_ROUNDING_ERROR: err_str = "A rounding error occurred; alternatively, no line-search step satisfies the sufficient decrease and curvature conditions."; break; case LBFGSERR_MINIMUMSTEP: err_str = "The line-search step became smaller than lbfgs_para::min_step."; break; case LBFGSERR_MAXIMUMSTEP: err_str = "The line-search step became larger than lbfgs_para::max_step."; break; case LBFGSERR_MAXIMUMLINESEARCH: err_str = "The line-search routine reaches the maximum number of evaluations."; break; case LBFGSERR_MAXIMUMITERATION: err_str = "The algorithm routine reaches the maximum number of iterations."; break; case LBFGSERR_WIDTHTOOSMALL: err_str = "Relative width of the interval of uncertainty is at most lbfgs_para::xtol."; break; case LBFGSERR_INVALIDPARAMETERS: err_str = "A logic error (negative line-search step) occurred."; break; case LBFGSERR_INCREASEGRADIENT: err_str = "The current search direction increases the objective function value."; break; default: err_str = "Unknown error."; break; } if (err_throw && err_code < 0) throw std::runtime_error(err_str.c_str()); else ss << err_str; #if defined _WINDOWS || __WIN32__ if (!err_throw) { if 
(ss_str.find("ofstream") != std::string::npos) { ss << " "; } else { SetConsoleTextAttribute(GetStdHandle(STD_ERROR_HANDLE), 7); ss << std::endl; } } #else if (!err_throw) { if (ss_str.find("ofstream") != std::string::npos) { ss << " "; } else { ss << "\033[0m" << std::endl; } } #endif return; } void gctl::lbfgs_solver::show_lbfgs_para(std::ostream &ss) { ss << "Solver's Setup Panel\n"; ss << "-----------------------------\n"; ss << "Solver: L-BFGS\n"; ss << "Corrtection | Epsilon | Past | Delta | Max-iter\n"; ss << lbfgs_param_.m << " | " << lbfgs_param_.epsilon << " | " << lbfgs_param_.past << " | " << lbfgs_param_.delta << " | " << lbfgs_param_.max_iterations << "\n"; ss << "-----------------------------\n"; ss << "Line search: "; if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_MORETHUENTE) ss << "morethuente\n"; if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) ss << "backtracking (armijo)\n"; if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) ss << "backtracking (wolfe)\n"; if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) ss << "backtracking (strong wolfe)\n"; if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO_QUAD) ss << "backtracking (quadratic armijo)\n"; ss << "Max-time | Max-step | Min-step | ftol | wolfe | gtol | xtol\n"; ss << lbfgs_param_.max_linesearch << " | " << lbfgs_param_.max_step << " | " << lbfgs_param_.min_step << " | " << lbfgs_param_.ftol << " | " << lbfgs_param_.wolfe << " | " << lbfgs_param_.gtol << " | " << lbfgs_param_.xtol << "\n"; ss << "-----------------------------\n"; ss << "L1 norm | Start index | End index\n"; ss << lbfgs_param_.orthantwise_c << " | " << lbfgs_param_.orthantwise_start << " | " << lbfgs_param_.orthantwise_end << "\n"; ss << "=============================\n"; return; } gctl::lbfgs_para gctl::lbfgs_solver::default_lbfgs_para() { lbfgs_para lp = lbfgs_defparam; return lp; } double gctl::lbfgs_solver::LBFGS_Minimize(array &m, std::ostream 
&ss, bool err_throw) { double fx = 0.0; clock_t start = clock(); lbfgs_error_str(lbfgs(m, fx, ss), ss, err_throw); clock_t end = clock(); double costime = 1000*(end-start)/(double)CLOCKS_PER_SEC; if (!err_throw) { ss << "Solver: L-BFGS. Time cost: " << costime << " ms" << std::endl; } return fx; } double gctl::lbfgs_solver::LBFGS_MinimizePreconditioned(array &m, std::ostream &ss, bool err_throw) { double fx = 0.0; clock_t start = clock(); lbfgs_error_str(lbfgs_preconditioned(m, fx, ss), ss, true); clock_t end = clock(); double costime = 1000*(end-start)/(double)CLOCKS_PER_SEC; if (!err_throw) { ss << std::endl << "Solver: L-BFGS (Preconditioned). Time cost: " << costime << " ms" << std::endl; } return fx; } gctl::lbfgs_return_code gctl::lbfgs_solver::lbfgs(array &x, double &ptr_fx, std::ostream &ss) { lbfgs_return_code ret = LBFGS_ALREADY_MINIMIZED, ls; int i, j, k, end, bound, ls_count; double step; /* Constant parameters and their default values. */ // m是计算海森矩阵时储存的前序向量大小 const int m = lbfgs_param_.m; array xp, g, gp, pg, d, w, pf, y2; array lm; iteration_data_t *it = nullptr; double ys, yy, Yk; double xnorm, gnorm, beta; double fx = 0.0; double rate = 0.0; /* Check the input parameters for errors. */ if (x.empty()) { return LBFGSERR_INVALID_N; } int n = x.size(); if (lbfgs_param_.epsilon < 0.) { return LBFGSERR_INVALID_EPSILON; } if (lbfgs_param_.past < 0) { return LBFGSERR_INVALID_TESTPERIOD; } if (lbfgs_param_.delta < 0.) { return LBFGSERR_INVALID_DELTA; } if (lbfgs_param_.min_step < 0.) { return LBFGSERR_INVALID_MINSTEP; } if (lbfgs_param_.max_step < lbfgs_param_.min_step) { return LBFGSERR_INVALID_MAXSTEP; } if (lbfgs_param_.ftol < 0.) { return LBFGSERR_INVALID_FTOL; } if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE || lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (lbfgs_param_.wolfe <= lbfgs_param_.ftol || 1. <= lbfgs_param_.wolfe) { return LBFGSERR_INVALID_WOLFE; } } if (lbfgs_param_.gtol < 0.) 
{ return LBFGSERR_INVALID_GTOL; } if (lbfgs_param_.xtol < 0.) { return LBFGSERR_INVALID_XTOL; } if (lbfgs_param_.max_linesearch <= 0) { return LBFGSERR_INVALID_MAXLINESEARCH; } if (lbfgs_param_.orthantwise_c < 0.) { return LBFGSERR_INVALID_ORTHANTWISE; } if (lbfgs_param_.orthantwise_start < 0 || n < lbfgs_param_.orthantwise_start) { return LBFGSERR_INVALID_ORTHANTWISE_START; } if (lbfgs_param_.orthantwise_end < 0) { // 默认设置在每个迭代都计算L1模 lbfgs_param_.orthantwise_end = n; } if (n < lbfgs_param_.orthantwise_end) { return LBFGSERR_INVALID_ORTHANTWISE_END; } // 若|x|的参数不是0,则检查线性搜索方法 if (lbfgs_param_.orthantwise_c != 0. && lbfgs_param_.linesearch != LBFGS_LINESEARCH_BACKTRACKING) { return LBFGSERR_INVALID_LINESEARCH; } // 初始化数组 /* Allocate working space. */ xp.resize(n); g.resize(n); gp.resize(n); d.resize(n); w.resize(n); y2.resize(n); // End // 初始化计算L1模的数组 if (lbfgs_param_.orthantwise_c != 0.) { /* Allocate working space for OW-LQN. */ pg.resize(n); } // 初始化有限内存方法需要的空间 /* Allocate limited memory storage. */ lm.resize(m); /* Initialize the limited memory. */ for (i = 0;i < m;++i) { lm[i].alpha = 0; lm[i].ys = 0; lm[i].s.resize(n); lm[i].y.resize(n); } /* Allocate an array for storing previous values of the objective function. */ if (0 < lbfgs_param_.past) { pf.resize(lbfgs_param_.past); } // 到此所有初始化工作完成 下面开始迭代前的初始计算 /* Evaluate the function value and its gradient. */ fx = LBFGS_Evaluate(x, g); // 步长为0,现在用不了 // 若|x|参数不为0 则需要计算x的L1模与似梯度 if (0. != lbfgs_param_.orthantwise_c) { /* Compute the L1 norm of the variable and add it to the object value. */ xnorm = owlqn_x1norm(x, lbfgs_param_.orthantwise_start, lbfgs_param_.orthantwise_end); fx += xnorm * lbfgs_param_.orthantwise_c; // 此时fx为这两部分的和 owlqn_pseudo_gradient(pg, x, g, n, lbfgs_param_.orthantwise_c, lbfgs_param_.orthantwise_start, lbfgs_param_.orthantwise_end); // 计算似梯度 } /* Store the initial value of the objective function. 
*/ // 如果param.past不为0,则pf不为NULL if (!pf.empty()) { pf[0] = fx; } /* Compute the direction; we assume the initial hessian matrix H_0 as the identity matrix. */ // 初始下降方向为梯度的反方向 // LBFGS中下降方向为 -1*H_k*g_k if (lbfgs_param_.orthantwise_c == 0.) { veccpy(d, g, -1.0); //拷贝数组 并反号(乘-1) } else { veccpy(d, pg, -1.0); //此时需拷贝似梯度 并反号(乘-1) } /* Make sure that the initial variables are not a minimizer. */ xnorm = sqrt(vecdot(x, x)); // vec2norm计算数组的L2模长 // 此段又要区别对待是否含有L1模的部分 if (lbfgs_param_.orthantwise_c == 0.) { gnorm = sqrt(vecdot(g, g)); } else { gnorm = sqrt(vecdot(pg, pg)); } // 为啥要保证xnorm大于等于1?不明白 // 答 参考头文件中的注释 如果模型参数的模长小于1则使用梯度的模长作为收敛性的测试指标 if (xnorm < 1.0) xnorm = 1.0; // 如果输入x即为最优化的解 则退出 if (gnorm / xnorm <= lbfgs_param_.epsilon) { ret = LBFGS_ALREADY_MINIMIZED; goto lbfgs_exit; } /* Compute the initial step: step = 1.0 / sqrt(vecdot(d, d, n)) */ // 计算估算的初始步长 这一步在实际应用中重要 在相似代码编写中需参考 step = 1.0/sqrt(vecdot(d, d)); // 计算数组L2模的倒数,与注释的内容等效 k = 1; end = 0; for (;;) { /* Store the current position and gradient vectors. */ veccpy(xp, x); // p for previous veccpy(gp, g); /* Search for an optimal step. */ if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING) { ls = line_search_backtracking_owlqn(n, x, &fx, g, d, &step, xp, gp, w, ls_count); } else if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_MORETHUENTE) { ls = line_search_morethuente(n, x, &fx, g, d, &step, xp, gp, w, ls_count); } else if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO || lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE || lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { ls = line_search_backtracking(n, x, &fx, g, d, &step, xp, gp, w, ls_count); } else if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO_QUAD) { ls = line_search_backtracking_quad(n, x, &fx, g, d, &step, xp, gp, w, ls_count); } else return LBFGSERR_INVALID_LINESEARCH; if (lbfgs_param_.orthantwise_c != 0.) 
{ owlqn_pseudo_gradient(pg, x, g, n, lbfgs_param_.orthantwise_c, lbfgs_param_.orthantwise_start, lbfgs_param_.orthantwise_end); } // 返回值小于0则表示线性搜索错误 此时则退回到上一次迭代的位置并退出 if (ls < 0) { /* Revert to the previous point. */ veccpy(x, xp); veccpy(g, gp); ret = ls; goto lbfgs_exit; } /* Compute x and g norms. */ xnorm = sqrt(vecdot(x, x)); if (lbfgs_param_.orthantwise_c == 0.) { gnorm = sqrt(vecdot(g, g)); } else { gnorm = sqrt(vecdot(pg, pg)); } /* Convergence test. The criterion is given by the following formula: |g(x)| / \max(1, |x|) < \epsilon 这里稍微解释一下:这个标准的含义是此时的模型梯度模与模型模的比值,这个值在模型变化非常平缓时会很小,一般我们认为此时也就达到了最优。 因此如果x的模小于1的话反而会放大模型梯度的模,所以这里默认x的模长大于等于1.0。同样的,可以预见对于求0值的非线性最优化问题,这个标准 并不适用,因为目标函数在0值的梯度很可能不是0。 */ if (xnorm < 1.0) xnorm = 1.0; if (gnorm / xnorm <= lbfgs_param_.epsilon) { ret = LBFGS_EPS_CONVERGENCE; // break; } if (fx <= lbfgs_param_.residual) { ret = LBFGS_RESI_CONVERGENCE; // break; } /* Test for stopping criterion. The criterion is given by the following formula: |(f(past_x) - f(x))| / f(x) < \delta 利用之前的目标函数值与当前目标函数值之差的绝对值与当前函数值的比值来确定是否终止迭代。与前一种判断方式一样,不适合求0的最优化问题。 */ if (!pf.empty()) { /* We don't test the stopping criterion while k < past. */ if (lbfgs_param_.past <= k) { /* Compute the relative improvement from the past. */ rate = (pf[k % lbfgs_param_.past] - fx) / fx; /* The stopping criterion. */ if (fabs(rate) < lbfgs_param_.delta) { ret = LBFGS_DELTA_CONVERGENCE; // break; } } /* Store the current value of the objective function. */ pf[k % lbfgs_param_.past] = fx; } if (lbfgs_param_.max_iterations != 0 && lbfgs_param_.max_iterations < k+1) { /* Maximum number of iterations. */ ret = LBFGSERR_MAXIMUMITERATION; // break; } /* Report the progress. 
*/ // 如果监控函数返回值不为0 则退出迭过程 if (LBFGS_Progress(x, g, fx, gnorm/xnorm, rate, lbfgs_param_, k, ls_count, ss)) { ret = LBFGS_STOP; } if (ret == LBFGS_EPS_CONVERGENCE || ret == LBFGS_DELTA_CONVERGENCE || ret == LBFGS_RESI_CONVERGENCE || ret == LBFGS_STOP || ret == LBFGSERR_MAXIMUMITERATION) { break; } // 以下是L-BFGS算法的核心部分 /* Update vectors s and y: s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. y_{k+1} = g_{k+1} - g_{k}. */ // 计算最新保存的s和y it = lm.get(end); vecdiff(it->s, x, xp); // 计算两个数组的差 it->s = x - xp vecdiff(it->y, g, gp); /* Compute scalars ys and yy: ys = y^t \cdot s = 1 / \rho. yy = y^t \cdot y. Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor). */ ys = vecdot(it->y, it->s); // 计算两个数组的点积 yy = vecdot(it->y, it->y); it->ys = ys; /* Recursive formula to compute dir = -(H \cdot g). This is described in page 779 of: Jorge Nocedal. Updating Quasi-Newton Matrices with Limited Storage. Mathematics of Computation, Vol. 35, No. 151, pp. 773--782, 1980. */ // m小于迭代次数 bound = (m <= k) ? m : k; ++k; // end+1等于m则将其重置为0 循环保存 end = (end + 1) % m; /* Compute the steepest direction. */ if (lbfgs_param_.orthantwise_c == 0.) { /* Compute the negative of gradients. */ veccpy(d, g, -1.0); // 注意这里有符号的翻转 } else { veccpy(d, pg, -1.0); } // 此处开始迭代 利用双重循环计算H^-1*g得到下降方向 j = end; for (i = 0;i < bound;++i) { j = (j + m - 1) % m; /* if (--j == -1) j = m-1; */ it = lm.get(j); /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */ it->alpha = vecdot(it->s, d); it->alpha /= it->ys; /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */ vecapp(d, it->y, -it->alpha); } // Edit by Yi Zhang on 2022-12-17 //vecscale(d, ys / yy); // 适当缩放d的大小 // Start edit if (ys < yy) { // 这里假设了初始的Hessian矩阵为单位阵乘以ys/yy vecscale(d, ys / yy); // 适当缩放d的大小 } else { // 假设初始的Hessian矩阵为 I + [(ys - yy)/tr(Yk)]diag(y2) vecmul(y2, lm[end].y, lm[end].y); Yk = vecdot(y2, y2); vecapp(d, y2, (ys-yy)/Yk); } // End edit for (i = 0;i < bound;++i) { it = lm.get(j); /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. 
*/ beta = vecdot(it->y, d); beta /= it->ys; /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */ vecapp(d, it->s, it->alpha - beta); j = (j + 1) % m; /* if (++j == m) j = 0; */ } /* Constrain the search direction for orthant-wise updates. */ if (lbfgs_param_.orthantwise_c != 0.) { for (i = lbfgs_param_.orthantwise_start;i < lbfgs_param_.orthantwise_end;++i) { if (d[i] * pg[i] >= 0) { d[i] = 0; } } } /* Now the search direction d is ready. We try step = 1 first. */ step = 1.0; } lbfgs_exit: /* Return the final value of the objective function. */ ptr_fx = fx; /* Free memory blocks used by this function. */ if (!lm.empty()) { for (i = 0;i < m;++i) { lm[i].s.clear(); lm[i].y.clear(); } } // End return ret; } gctl::lbfgs_return_code gctl::lbfgs_solver::lbfgs_preconditioned(array &x, double &ptr_fx, std::ostream &ss) { lbfgs_return_code ret = LBFGS_ALREADY_MINIMIZED, ls; int i, j, k, ls_count, end, bound; double step; /* Constant parameters and their default values. */ // m是计算海森矩阵时储存的前序向量大小 const int m = lbfgs_param_.m; array xp, g, gp, pg, d, w, pf, dp, y2; array lm; iteration_data_t *it = nullptr; double ys, yy; double Yk; // Add by Yi Zhang on 02-16-2022 double xnorm, gnorm, beta; double fx = 0.0; double rate = 0.0; /* Check the input parameters for errors. */ if (x.empty()) { return LBFGSERR_INVALID_N; } int n = x.size(); if (lbfgs_param_.epsilon < 0.) { return LBFGSERR_INVALID_EPSILON; } if (lbfgs_param_.past < 0) { return LBFGSERR_INVALID_TESTPERIOD; } if (lbfgs_param_.delta < 0.) { return LBFGSERR_INVALID_DELTA; } if (lbfgs_param_.min_step < 0.) { return LBFGSERR_INVALID_MINSTEP; } if (lbfgs_param_.max_step < lbfgs_param_.min_step) { return LBFGSERR_INVALID_MAXSTEP; } if (lbfgs_param_.ftol < 0.) { return LBFGSERR_INVALID_FTOL; } if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE || lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (lbfgs_param_.wolfe <= lbfgs_param_.ftol || 1. 
<= lbfgs_param_.wolfe) { return LBFGSERR_INVALID_WOLFE; } } if (lbfgs_param_.gtol < 0.) { return LBFGSERR_INVALID_GTOL; } if (lbfgs_param_.xtol < 0.) { return LBFGSERR_INVALID_XTOL; } if (lbfgs_param_.max_linesearch <= 0) { return LBFGSERR_INVALID_MAXLINESEARCH; } if (lbfgs_param_.orthantwise_c < 0.) { return LBFGSERR_INVALID_ORTHANTWISE; } if (lbfgs_param_.orthantwise_start < 0 || n < lbfgs_param_.orthantwise_start) { return LBFGSERR_INVALID_ORTHANTWISE_START; } if (lbfgs_param_.orthantwise_end < 0) { // 默认设置在每个迭代都计算L1模 lbfgs_param_.orthantwise_end = n; } if (n < lbfgs_param_.orthantwise_end) { return LBFGSERR_INVALID_ORTHANTWISE_END; } // 若|x|的参数不是0,则检查线性搜索方法 if (lbfgs_param_.orthantwise_c != 0. && lbfgs_param_.linesearch != LBFGS_LINESEARCH_BACKTRACKING) { return LBFGSERR_INVALID_LINESEARCH; } // 初始化数组 /* Allocate working space. */ xp.resize(n); g.resize(n); gp.resize(n); d.resize(n); w.resize(n); y2.resize(n); dp.resize(n); // End // 初始化计算L1模的数组 if (lbfgs_param_.orthantwise_c != 0.) { /* Allocate working space for OW-LQN. */ pg.resize(n); } // 初始化有限内存方法需要的空间 /* Allocate limited memory storage. */ lm.resize(m); /* Initialize the limited memory. */ for (i = 0;i < m;++i) { lm[i].alpha = 0; lm[i].ys = 0; lm[i].s.resize(n); lm[i].y.resize(n); } /* Allocate an array for storing previous values of the objective function. */ if (0 < lbfgs_param_.past) { pf.resize(lbfgs_param_.past); } // 到此所有初始化工作完成 下面开始迭代前的初始计算 /* Evaluate the function value and its gradient. */ fx = LBFGS_Evaluate(x, g); // 步长为0,现在用不了 // 若|x|参数不为0 则需要计算x的L1模与似梯度 if (0. != lbfgs_param_.orthantwise_c) { /* Compute the L1 norm of the variable and add it to the object value. 
*/ xnorm = owlqn_x1norm(x, lbfgs_param_.orthantwise_start, lbfgs_param_.orthantwise_end); fx += xnorm * lbfgs_param_.orthantwise_c; // 此时fx为这两部分的和 owlqn_pseudo_gradient(pg, x, g, n, lbfgs_param_.orthantwise_c, lbfgs_param_.orthantwise_start, lbfgs_param_.orthantwise_end); // 计算似梯度 } /* Store the initial value of the objective function. */ // 如果param.past不为0,则pf不为NULL if (!pf.empty()) { pf[0] = fx; } /* Compute the direction; we assume the initial hessian matrix H_0 as the identity matrix. */ // 初始下降方向为梯度的反方向 // LBFGS中下降方向为 -1*H_k*g_k if (lbfgs_param_.orthantwise_c == 0.) { veccpy(d, g, -1.0); //拷贝数组 并反号(乘-1) } else { veccpy(d, pg, -1.0); //此时需拷贝似梯度 并反号(乘-1) } /* Make sure that the initial variables are not a minimizer. */ xnorm = sqrt(vecdot(x, x)); // vec2norm计算数组的L2模长 // 此段又要区别对待是否含有L1模的部分 if (lbfgs_param_.orthantwise_c == 0.) { gnorm = sqrt(vecdot(g, g)); } else { gnorm = sqrt(vecdot(pg, pg)); } // 为啥要保证xnorm大于等于1?不明白 // 答 参考头文件中的注释 如果模型参数的模长小于1则使用梯度的模长作为收敛性的测试指标 if (xnorm < 1.0) xnorm = 1.0; // 如果输入x即为最优化的解 则退出 if (gnorm / xnorm <= lbfgs_param_.epsilon) { ret = LBFGS_ALREADY_MINIMIZED; goto lbfgs_exit; } /* Compute the initial step: step = 1.0 / sqrt(vecdot(d, d, n)) */ // 计算估算的初始步长 这一步在实际应用中重要 在相似代码编写中需参考 step = 1.0/sqrt(vecdot(d, d)); // 计算数组L2模的倒数,与注释的内容等效 k = 1; end = 0; for (;;) { /* Store the current position and gradient vectors. */ veccpy(xp, x); // p for previous veccpy(gp, g); /* Search for an optimal step. 
*/ if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING) { ls = line_search_backtracking_owlqn(n, x, &fx, g, d, &step, xp, gp, w, ls_count); } else if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_MORETHUENTE) { ls = line_search_morethuente(n, x, &fx, g, d, &step, xp, gp, w, ls_count); } else if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO || lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE || lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { ls = line_search_backtracking(n, x, &fx, g, d, &step, xp, gp, w, ls_count); } else if (lbfgs_param_.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO_QUAD) { ls = line_search_backtracking_quad(n, x, &fx, g, d, &step, xp, gp, w, ls_count); } else return LBFGSERR_INVALID_LINESEARCH; if (lbfgs_param_.orthantwise_c != 0.) { owlqn_pseudo_gradient(pg, x, g, n, lbfgs_param_.orthantwise_c, lbfgs_param_.orthantwise_start, lbfgs_param_.orthantwise_end); } // 返回值小于0则表示线性搜索错误 此时则退回到上一次迭代的位置并退出 if (ls < 0) { /* Revert to the previous point. */ veccpy(x, xp); veccpy(g, gp); ret = ls; // edited by Yi Zhang goto lbfgs_exit; } /* Compute x and g norms. */ xnorm = sqrt(vecdot(x, x)); if (lbfgs_param_.orthantwise_c == 0.) { gnorm = sqrt(vecdot(g, g)); } else { gnorm = sqrt(vecdot(pg, pg)); } /* Convergence test. The criterion is given by the following formula: |g(x)| / \max(1, |x|) < \epsilon 这里稍微解释一下:这个标准的含义是此时的模型梯度模与模型模的比值,这个值在模型变化非常平缓时会很小,一般我们认为此时也就达到了最优。 因此如果x的模小于1的话反而会放大模型梯度的模,所以这里默认x的模长大于等于1.0。同样的,可以预见对于求0值的非线性最优化问题,这个标准 并不适用,因为目标函数在0值的梯度很可能不是0。 */ if (xnorm < 1.0) xnorm = 1.0; if (gnorm / xnorm <= lbfgs_param_.epsilon) { ret = LBFGS_EPS_CONVERGENCE; // break; } if (fx <= lbfgs_param_.residual) { ret = LBFGS_RESI_CONVERGENCE; // break; } /* Test for stopping criterion. 
The criterion is given by the following formula: |(f(past_x) - f(x))| / f(x) < \delta 利用之前的目标函数值与当前目标函数值之差的绝对值与当前函数值的比值来确定是否终止迭代。与前一种判断方式一样,不适合求0的最优化问题。 */ if (!pf.empty()) { /* We don't test the stopping criterion while k < past. */ if (lbfgs_param_.past <= k) { /* Compute the relative improvement from the past. */ rate = (pf[k % lbfgs_param_.past] - fx) / fx; /* The stopping criterion. */ if (fabs(rate) < lbfgs_param_.delta) { ret = LBFGS_DELTA_CONVERGENCE; // break; } } /* Store the current value of the objective function. */ pf[k % lbfgs_param_.past] = fx; } if (lbfgs_param_.max_iterations != 0 && lbfgs_param_.max_iterations < k+1) { /* Maximum number of iterations. */ ret = LBFGSERR_MAXIMUMITERATION; // break; } /* Report the progress. */ // 如果监控函数返回值不为0 则退出迭过程 if (LBFGS_Progress(x, g, fx, gnorm/xnorm, rate, lbfgs_param_, k, ls_count, ss)) { ret = LBFGS_STOP; } if (ret == LBFGS_EPS_CONVERGENCE || ret == LBFGS_DELTA_CONVERGENCE || ret == LBFGS_RESI_CONVERGENCE || ret == LBFGS_STOP || ret == LBFGSERR_MAXIMUMITERATION) { break; } // 以下是L-BFGS算法的核心部分 /* Update vectors s and y: s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. y_{k+1} = g_{k+1} - g_{k}. */ // 计算最新保存的s和y it = lm.get(end); vecdiff(it->s, x, xp); // 计算两个数组的差 it->s = x - xp vecdiff(it->y, g, gp); /* Compute scalars ys and yy: ys = y^t \cdot s = 1 / \rho. yy = y^t \cdot y. Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor). */ ys = vecdot(it->y, it->s); // 计算两个数组的点积 yy = vecdot(it->y, it->y); it->ys = ys; /* Recursive formula to compute dir = -(H \cdot g). This is described in page 779 of: Jorge Nocedal. Updating Quasi-Newton Matrices with Limited Storage. Mathematics of Computation, Vol. 35, No. 151, pp. 773--782, 1980. */ // m小于迭代次数 bound = (m <= k) ? m : k; ++k; // end+1等于m则将其重置为0 循环保存 end = (end + 1) % m; /* Compute the steepest direction. */ if (lbfgs_param_.orthantwise_c == 0.) { /* Compute the negative of gradients. 
*/ veccpy(d, g, -1.0); // 注意这里有符号的翻转 } else { veccpy(d, pg, -1.0); } // 此处开始迭代 利用双重循环计算H^-1*g得到下降方向 j = end; for (i = 0;i < bound;++i) { j = (j + m - 1) % m; /* if (--j == -1) j = m-1; */ it = lm.get(j); /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */ it->alpha = vecdot(it->s, d); it->alpha /= it->ys; /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */ vecapp(d, it->y, -it->alpha); } // Add by Yi Zhang on 02-16-2022 // Wah June Leong & Chuei Yee Chen (2013) A class of diagonal preconditioners for limited memory BFGS method, // Optimization Methods and Software, 28:2, 379-392, DOI: 10.1080/10556788.2011.653356 // 我们在这里提供一个预优函数的接口 需还原则删除下面的代码段 同时取消上面一行注释 if (lbfgs_param_.orthantwise_c == 0.) { LBFGS_Precondition(x, g, d, dp); } else { LBFGS_Precondition(x, pg, d, dp); } veccpy(d, dp); for (i = 0;i < bound;++i) { it = lm.get(j); /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */ beta = vecdot(it->y, d); beta /= it->ys; /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */ vecapp(d, it->s, it->alpha - beta); j = (j + 1) % m; /* if (++j == m) j = 0; */ } /* Constrain the search direction for orthant-wise updates. */ if (lbfgs_param_.orthantwise_c != 0.) { for (i = lbfgs_param_.orthantwise_start;i < lbfgs_param_.orthantwise_end;++i) { if (d[i] * pg[i] >= 0) { d[i] = 0; } } } /* Now the search direction d is ready. We try step = 1 first. */ step = 1.0; } lbfgs_exit: /* Return the final value of the objective function. */ ptr_fx = fx; /* Free memory blocks used by this function. */ if (!lm.empty()) { for (i = 0;i < m;++i) { lm[i].s.clear(); lm[i].y.clear(); } } // End return ret; } /** * Define the local variables for computing minimizers. */ #define USES_MINIMIZER \ double a, d, gamma, theta, p, q, r, s; /** * Find a minimizer of an interpolated cubic function. * @param cm The minimizer of the interpolated cubic. * @param u The value of one point, u. * @param fu The value of f(u). * @param du The value of f'(u). * @param v The value of another point, v. 
 * @param fv The value of f(v).
 * @param dv The value of f'(v).
 */
#define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
    if ((v) < (u)) gamma = -gamma; \
    p = gamma - (du) + theta; \
    q = gamma - (du) + gamma + (dv); \
    r = p / q; \
    (cm) = (u) + r * d;

/**
 * Find a minimizer of an interpolated cubic function, with the result
 * clipped to [xmin, xmax] when the cubic has no interior minimizer.
 * @param cm The minimizer of the interpolated cubic.
 * @param u The value of one point, u.
 * @param fu The value of f(u).
 * @param du The value of f'(u).
 * @param v The value of another point, v.
 * @param fv The value of f(v).
 * @param dv The value of f'(v).
 * @param xmin The minimum value.
 * @param xmax The maximum value.
 */
#define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
    if ((u) < (v)) gamma = -gamma; \
    p = gamma - (dv) + theta; \
    q = gamma - (dv) + gamma + (du); \
    r = p / q; \
    if (r < 0. && gamma != 0.) { \
        (cm) = (v) - r * d; \
    } else if (a < 0) { \
        (cm) = (xmax); \
    } else { \
        (cm) = (xmin); \
    }

/**
 * Find a minimizer of an interpolated quadratic function
 * (built from two function values and one derivative).
 * @param qm The minimizer of the interpolated quadratic.
 * @param u The value of one point, u.
 * @param fu The value of f(u).
 * @param du The value of f'(u).
 * @param v The value of another point, v.
 * @param fv The value of f(v).
 */
#define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
    a = (v) - (u); \
    (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;

/**
 * Find a minimizer of an interpolated quadratic function
 * (secant form, built from two derivatives).
 * @param qm The minimizer of the interpolated quadratic.
 * @param u The value of one point, u.
 * @param du The value of f'(u).
 * @param v The value of another point, v.
 * @param dv The value of f'(v).
 */
#define QUARD_MINIMIZER2(qm, u, du, v, dv) \
    a = (u) - (v); \
    (qm) = (v) + (dv) / ((dv) - (du)) * a;

/**
 * Backtracking line search. Starting from *stp, the step is repeatedly
 * scaled until the condition selected by lbfgs_param_.linesearch holds:
 * Armijo (sufficient decrease), regular Wolfe, or strong Wolfe.
 *
 * @param n         Number of variables (not referenced in this variant).
 * @param x         [in/out] Trial point; set to xp + (*stp) * s.
 * @param f         [in/out] Objective value at x.
 * @param g         [in/out] Gradient at x (recomputed by LBFGS_Evaluate).
 * @param s         Search direction.
 * @param stp       [in/out] Step length; must be positive on entry.
 * @param xp        Point at the start of the line search.
 * @param gp        Gradient at xp (not referenced in this variant).
 * @param wp        Work array (not referenced in this variant).
 * @param ls_count  [out] Number of function/gradient evaluations performed.
 * @return gctl::LBFGS_STOP on a successful search, or an LBFGSERR_* code.
 */
gctl::lbfgs_return_code gctl::lbfgs_solver::line_search_backtracking(int n, gctl::array &x, double *f,
    gctl::array &g, gctl::array &s, double *stp, const gctl::array &xp, const gctl::array &gp,
    gctl::array &wp, int &ls_count)
{
    ls_count = 0;
    double width, dg;
    double finit, dginit = 0., dgtest;
    const double dec = 0.5, inc = 2.1; // shrink/grow factors for the step

    /* Check the input parameters for errors. */
    if (*stp <= 0.)
    {
        return gctl::LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = vecdot(g, s); // dot product; g is the gradient, s is the search direction

    /* Make sure that s points to a descent direction. */
    if (0 < dginit)
    {
        return gctl::LBFGSERR_INCREASEGRADIENT;
    }

    /* The initial value of the objective function. */
    finit = *f;
    dgtest = lbfgs_param_.ftol * dginit; // ftol: sufficient-decrease (function) tolerance

    for (;;)
    {
        veccpy(x, xp);
        vecapp(x, s, *stp); // x += (*stp) * s

        /* Evaluate the function and gradient values at the trial point. */
        *f = LBFGS_Evaluate(x, g);
        ls_count++;

        // Sufficient decrease test.
        if (*f > finit + *stp * dgtest)
        {
            width = dec; // sufficient decrease failed: shrink the step
        }
        else
        {
            // Sufficient decrease holds. With the plain Armijo strategy we are
            // done; otherwise continue and test the (strong) Wolfe conditions.
            /* The sufficient decrease condition (Armijo condition). */
            if (lbfgs_param_.linesearch == gctl::LBFGS_LINESEARCH_BACKTRACKING_ARMIJO)
            {
                /* Exit with the Armijo condition. */
                return gctl::LBFGS_STOP;
            }

            /* Check the Wolfe condition. */
            dg = vecdot(g, s); // curvature test needs the new gradient
            if (dg < lbfgs_param_.wolfe * dginit)
            {
                // dginit is normally negative, so this is the lower bound of the
                // Wolfe (curvature) condition; enlarge the step when it fails.
                width = inc;
            }
            else
            {
                // Regular Wolfe condition holds; with the Wolfe strategy we are
                // done, otherwise go on to test the strong Wolfe condition.
                if(lbfgs_param_.linesearch == gctl::LBFGS_LINESEARCH_BACKTRACKING_WOLFE)
                {
                    /* Exit with the regular Wolfe condition. */
                    return gctl::LBFGS_STOP;
                }

                /* Check the strong Wolfe condition. */
                if(dg > -1.0 * lbfgs_param_.wolfe * dginit)
                {
                    width = dec; // |curvature| bound violated: shrink the step
                }
                else
                {
                    /* Exit with the strong Wolfe condition. */
                    return gctl::LBFGS_STOP;
                }
            }
        }

        // The exits below return a step that is NOT guaranteed to satisfy
        // the selected search condition.
        if (*stp < lbfgs_param_.min_step)
        {
            /* The step became smaller than the minimum allowed value. */
            return gctl::LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > lbfgs_param_.max_step)
        {
            /* The step became larger than the maximum allowed value. */
            return gctl::LBFGSERR_MAXIMUMSTEP;
        }
        if (lbfgs_param_.max_linesearch <= ls_count)
        {
            /* The number of line-search trials exceeded the limit. */
            return gctl::LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width; // rescale the step and try again
    }
}

/**
 * Backtracking line search that proposes the next trial step by quadratic
 * interpolation of the objective along the search direction. Only the
 * sufficient decrease (Armijo) condition is tested.
 *
 * Parameters and return codes are the same as line_search_backtracking().
 */
gctl::lbfgs_return_code gctl::lbfgs_solver::line_search_backtracking_quad(int n, gctl::array &x, double *f,
    gctl::array &g, gctl::array &s, double *stp, const gctl::array &xp, const gctl::array &gp,
    gctl::array &wp, int &ls_count)
{
    ls_count = 0;
    double dg, stp2;
    double finit, dginit = 0., dgtest;
    const double dec = 0.5; // fallback shrink factor

    /* Check the input parameters for errors. */
    if (*stp <= 0.)
    {
        return gctl::LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = vecdot(g, s); // dot product; g is the gradient, s is the search direction

    /* Make sure that s points to a descent direction. */
    if (0 < dginit)
    {
        return gctl::LBFGSERR_INCREASEGRADIENT;
    }

    /* The initial value of the objective function. */
    finit = *f;
    dgtest = lbfgs_param_.ftol * dginit; // ftol: sufficient-decrease (function) tolerance

    for (;;)
    {
        veccpy(x, xp);
        vecapp(x, s, *stp); // x += (*stp) * s

        /* Evaluate the function and gradient values at the trial point. */
        *f = LBFGS_Evaluate(x, g);
        ls_count++;

        // Sufficient decrease test.
        if (*f > finit + *stp * dgtest)
        {
            // Minimizer of the 1-D quadratic through (0, finit), slope dginit
            // and (*stp, *f); fall back to a plain shrink if it is negative.
            stp2 = 0.5*dginit*(*stp)*(*stp)/(finit - (*f) + dginit*(*stp));
            if (stp2 < 0)
            {
                (*stp) *= dec;
            }
            else
            {
                (*stp) = stp2;
            }
        }
        else
        {
            // Sufficient decrease holds; with the Armijo-quadratic strategy we
            // are done, otherwise keep searching with an updated step.
            /* The sufficient decrease condition (Armijo condition.
             */
            if (lbfgs_param_.linesearch == gctl::LBFGS_LINESEARCH_BACKTRACKING_ARMIJO_QUAD)
            {
                /* Exit with the Armijo condition. */
                return gctl::LBFGS_STOP;
            }
        }

        // The exits below return a step that is NOT guaranteed to satisfy
        // the search condition.
        if (*stp < lbfgs_param_.min_step)
        {
            /* The step became smaller than the minimum allowed value. */
            return gctl::LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > lbfgs_param_.max_step)
        {
            /* The step became larger than the maximum allowed value. */
            return gctl::LBFGSERR_MAXIMUMSTEP;
        }
        if (lbfgs_param_.max_linesearch <= ls_count)
        {
            /* The number of line-search trials exceeded the limit. */
            return gctl::LBFGSERR_MAXIMUMLINESEARCH;
        }
    }
}

/**
 * Backtracking line search for orthant-wise (OWL-QN) updates. Each trial
 * point is projected back onto the orthant chosen from xp/gp, the L1
 * penalty (orthantwise_c) is added to the objective, and only a sufficient
 * decrease condition (measured against gp over the actual displacement)
 * is tested.
 *
 * @param n         Number of variables.
 * @param x         [in/out] Trial point, projected onto the chosen orthant.
 * @param f         [in/out] Objective value at x, including the L1 term.
 * @param g         [in/out] Gradient at x.
 * @param s         Search direction.
 * @param stp       [in/out] Step length; must be positive on entry.
 * @param xp        Point at the start of the line search.
 * @param gp        Gradient at xp; defines the orthant where xp[i] == 0.
 * @param wp        Work array holding the orthant signs.
 * @param ls_count  [out] Number of function/gradient evaluations performed.
 * @return gctl::LBFGS_STOP on a successful search, or an LBFGSERR_* code.
 */
gctl::lbfgs_return_code gctl::lbfgs_solver::line_search_backtracking_owlqn(int n, gctl::array &x, double *f,
    gctl::array &g, gctl::array &s, double *stp, const gctl::array &xp, const gctl::array &gp,
    gctl::array &wp, int &ls_count)
{
    int i;
    ls_count = 0;
    double width = 0.5, norm = 0.;
    double finit = *f, dgtest;

    /* Check the input parameters for errors. */
    if (*stp <= 0.)
    {
        return gctl::LBFGSERR_INVALIDPARAMETERS;
    }

    /* Choose the orthant for the new point. */
    for (i = 0;i < n; ++i)
    {
        wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
    }

    for (;;)
    {
        /* Update the current point. */
        veccpy(x, xp);
        vecapp(x, s, *stp);

        /* The current point is projected onto the orthant. */
        owlqn_project(x, wp, lbfgs_param_.orthantwise_start, lbfgs_param_.orthantwise_end);

        /* Evaluate the function and gradient values. */
        *f = LBFGS_Evaluate(x, g);

        /* Compute the L1 norm of the variables and add it to the object value. */
        norm = owlqn_x1norm(x, lbfgs_param_.orthantwise_start, lbfgs_param_.orthantwise_end);
        *f += norm * lbfgs_param_.orthantwise_c;

        ls_count++;

        // Directional-derivative surrogate over the actual (projected) move.
        dgtest = 0.;
        for (i = 0;i < n;++i)
        {
            dgtest += (x[i] - xp[i]) * gp[i];
        }

        if (*f <= finit + lbfgs_param_.ftol * dgtest)
        {
            /* The sufficient decrease condition holds. */
            return gctl::LBFGS_STOP;
        }

        if (*stp < lbfgs_param_.min_step)
        {
            /* The step became smaller than the minimum allowed value. */
            return gctl::LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > lbfgs_param_.max_step)
        {
            /* The step is the maximum value.
             */
            return gctl::LBFGSERR_MAXIMUMSTEP;
        }
        if (lbfgs_param_.max_linesearch <= ls_count)
        {
            /* The number of line-search trials exceeded the limit. */
            return gctl::LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width; // shrink the step and try again
    }
}

/**
 * Line search by the method of More and Thuente. An interval of
 * uncertainty [stx, sty] is maintained and refined through
 * update_trial_interval() until the sufficient decrease condition and
 * the directional derivative (curvature) condition both hold.
 *
 * @param n         Number of variables (not referenced in this variant).
 * @param x         [in/out] Trial point; set to xp + (*stp) * s.
 * @param f         [in/out] Objective value at x.
 * @param g         [in/out] Gradient at x (recomputed by LBFGS_Evaluate).
 * @param s         Search direction.
 * @param stp       [in/out] Step length; must be positive on entry.
 * @param xp        Point at the start of the line search.
 * @param gp        Gradient at xp (not referenced in this variant).
 * @param wa        Work array (not referenced in this variant).
 * @param ls_count  [out] Number of function/gradient evaluations performed.
 * @return gctl::LBFGS_STOP on a successful search, or an LBFGSERR_* code.
 */
gctl::lbfgs_return_code gctl::lbfgs_solver::line_search_morethuente(int n, gctl::array &x, double *f,
    gctl::array &g, gctl::array &s, double *stp, const gctl::array &xp, const gctl::array &gp,
    gctl::array &wa, int &ls_count)
{
    ls_count = 0;
    int brackt, stage1, uinfo = 0;
    double dg;
    double stx, fx, dgx;
    double sty, fy, dgy;
    double fxm, dgxm, fym, dgym, fm, dgm;
    double finit, ftest1, dginit, dgtest;
    double width, prev_width;
    double stmin, stmax;

    /* Check the input parameters for errors. */
    if (*stp <= 0.)
    {
        return gctl::LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = vecdot(g, s);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit)
    {
        return gctl::LBFGSERR_INCREASEGRADIENT;
    }

    /* Initialize local variables. */
    brackt = 0;
    stage1 = 1;
    finit = *f;
    dgtest = lbfgs_param_.ftol * dginit;
    width = lbfgs_param_.max_step - lbfgs_param_.min_step;
    prev_width = 2.0 * width;

    /*
        The variables stx, fx, dgx contain the values of the step,
        function, and directional derivative at the best step.
        The variables sty, fy, dgy contain the value of the step,
        function, and derivative at the other endpoint of
        the interval of uncertainty.
        The variables stp, f, dg contain the values of the step,
        function, and derivative at the current step.
    */
    stx = sty = 0.;
    fx = fy = finit;
    dgx = dgy = dginit;

    for (;;)
    {
        /*
            Set the minimum and maximum steps to correspond to the
            present interval of uncertainty.
        */
        if (brackt)
        {
            stmin = min2(stx, sty);
            stmax = max2(stx, sty);
        }
        else
        {
            stmin = stx;
            stmax = *stp + 4.0 * (*stp - stx);
        }

        /* Clip the step in the range of [stpmin, stpmax]. */
        if (*stp < lbfgs_param_.min_step) *stp = lbfgs_param_.min_step;
        if (lbfgs_param_.max_step < *stp) *stp = lbfgs_param_.max_step;

        /*
            If an unusual termination is to occur then let
            stp be the lowest point obtained so far.
        */
        if ((brackt && ((*stp <= stmin || stmax <= *stp) || lbfgs_param_.max_linesearch <= ls_count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= lbfgs_param_.xtol * stmax)))
        {
            *stp = stx;
        }

        /* Compute the current value of x: x <- x + (*stp) * s. */
        veccpy(x, xp);
        vecapp(x, s, *stp);

        /* Evaluate the function and gradient values. */
        *f = LBFGS_Evaluate(x, g);
        dg = vecdot(g, s);

        ftest1 = finit + *stp * dgtest;
        ls_count++;

        /* Test for errors and convergence. */
        if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0))
        {
            /* Rounding errors prevent further progress. */
            return gctl::LBFGSERR_ROUNDING_ERROR;
        }
        if (*stp == lbfgs_param_.max_step && *f <= ftest1 && dg <= dgtest)
        {
            /* The step is the maximum value. */
            return gctl::LBFGSERR_MAXIMUMSTEP;
        }
        if (*stp == lbfgs_param_.min_step && (ftest1 < *f || dgtest <= dg))
        {
            /* The step is the minimum value. */
            return gctl::LBFGSERR_MINIMUMSTEP;
        }
        if (brackt && (stmax - stmin) <= lbfgs_param_.xtol * stmax)
        {
            /* Relative width of the interval of uncertainty is at most xtol. */
            return gctl::LBFGSERR_WIDTHTOOSMALL;
        }
        if (lbfgs_param_.max_linesearch <= ls_count)
        {
            /* The number of line-search trials exceeded the limit. */
            return gctl::LBFGSERR_MAXIMUMLINESEARCH;
        }
        if (*f <= ftest1 && fabs(dg) <= lbfgs_param_.gtol * (-dginit))
        {
            /* The sufficient decrease condition and the directional derivative condition hold. */
            return gctl::LBFGS_STOP;
        }

        /*
            In the first stage we seek a step for which the modified
            function has a nonpositive value and nonnegative derivative.
         */
        if (stage1 && *f <= ftest1 && min2(lbfgs_param_.ftol, lbfgs_param_.gtol) * dginit <= dg)
        {
            stage1 = 0;
        }

        /*
            A modified function is used to predict the step only if
            we have not obtained a step for which the modified
            function has a nonpositive function value and nonnegative
            derivative, and if a lower function value has been
            obtained but the decrease is not sufficient.
        */
        if (stage1 && ftest1 < *f && *f <= fx)
        {
            /* Define the modified function and derivative values. */
            fm = *f - *stp * dgtest;
            fxm = fx - stx * dgtest;
            fym = fy - sty * dgtest;
            dgm = dg - dgtest;
            dgxm = dgx - dgtest;
            dgym = dgy - dgtest;

            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
            */
            uinfo = update_trial_interval(
                &stx, &fxm, &dgxm,
                &sty, &fym, &dgym,
                stp, &fm, &dgm,
                stmin, stmax, &brackt
                );

            /* Reset the function and gradient values for f. */
            fx = fxm + stx * dgtest;
            fy = fym + sty * dgtest;
            dgx = dgxm + dgtest;
            dgy = dgym + dgtest;
        }
        else
        {
            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
            */
            uinfo = update_trial_interval(
                &stx, &fx, &dgx,
                &sty, &fy, &dgy,
                stp, f, &dg,
                stmin, stmax, &brackt
                );
        }

        /* Force a sufficient decrease in the interval of uncertainty. */
        if (brackt)
        {
            if (0.66 * prev_width <= fabs(sty - stx))
            {
                *stp = stx + 0.5 * (sty - stx);
            }
            prev_width = width;
            width = fabs(sty - stx);
        }
    }

    return gctl::LBFGSERR_LOGICERROR; // unreachable: the loop exits via return
}

/**
 * Update the interval of uncertainty [x, y] and compute a new trial step t
 * by cubic/quadratic interpolation (four cases, following More & Thuente).
 *
 * @param x, fx, dx  [in/out] Best step so far, f and f' at it.
 * @param y, fy, dy  [in/out] Other endpoint of the interval, f and f' at it.
 * @param t, ft, dt  [in/out] Trial step, f and f' at it; *t receives the
 *                   new trial value on return.
 * @param tmin, tmax Bounds for the trial value.
 * @param brackt     [in/out] Nonzero once the minimizer has been bracketed.
 * @return 0 on success, or an LBFGSERR_* code on invalid input.
 */
int update_trial_interval(double *x, double *fx, double *dx, double *y, double *fy, double *dy, double *t, double *ft, double *dt, const double tmin, const double tmax, int *brackt)
{
    int bound;
    int dsign = fsigndiff(dt, dx);
    double mc; /* minimizer of an interpolated cubic. */
    double mq; /* minimizer of an interpolated quadratic. */
    double newt; /* new trial value. */
    USES_MINIMIZER; /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */

    /* Check the input parameters for errors.
     */
    if (*brackt)
    {
        if (*t <= min2(*x, *y) || max2(*x, *y) <= *t)
        {
            /* The trial value t is out of the interval. */
            return gctl::LBFGSERR_OUTOFINTERVAL;
        }
        if (0. <= *dx * (*t - *x))
        {
            /* The function must decrease from x. */
            return gctl::LBFGSERR_INCREASEGRADIENT;
        }
        if (tmax < tmin)
        {
            /* Incorrect tmin and tmax specified. */
            return gctl::LBFGSERR_INCORRECT_TMINMAX;
        }
    }

    /* Trial value selection. */
    if (*fx < *ft)
    {
        /*
            Case 1: a higher function value.
            The minimum is brackt. If the cubic minimizer is closer
            to x than the quadratic one, the cubic one is taken, else
            the average of the minimizers is taken.
        */
        *brackt = 1;
        bound = 1;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
        if (fabs(mc - *x) < fabs(mq - *x))
        {
            newt = mc;
        }
        else
        {
            newt = mc + 0.5 * (mq - mc);
        }
    }
    else if (dsign)
    {
        /*
            Case 2: a lower function value and derivatives of
            opposite sign. The minimum is brackt. If the cubic
            minimizer is closer to x than the quadratic (secant) one,
            the cubic one is taken, else the quadratic one is taken.
        */
        *brackt = 1;
        bound = 0;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (fabs(mc - *t) > fabs(mq - *t))
        {
            newt = mc;
        }
        else
        {
            newt = mq;
        }
    }
    else if (fabs(*dt) < fabs(*dx))
    {
        /*
            Case 3: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative decreases.
            The cubic minimizer is only used if the cubic tends to
            infinity in the direction of the minimizer or if the minimum
            of the cubic is beyond t. Otherwise the cubic minimizer is
            defined to be either tmin or tmax. The quadratic (secant)
            minimizer is also computed and if the minimum is brackt
            then the minimizer closest to x is taken, else the one
            farthest away is taken.
         */
        bound = 1;
        CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (*brackt)
        {
            if (fabs(*t - mc) < fabs(*t - mq))
            {
                newt = mc;
            }
            else
            {
                newt = mq;
            }
        }
        else
        {
            if (fabs(*t - mc) > fabs(*t - mq))
            {
                newt = mc;
            }
            else
            {
                newt = mq;
            }
        }
    }
    else
    {
        /*
            Case 4: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative does
            not decrease. If the minimum is not brackt, the step
            is either tmin or tmax, else the cubic minimizer is taken.
        */
        bound = 0;
        if (*brackt)
        {
            CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
        }
        else if (*x < *t)
        {
            newt = tmax;
        }
        else
        {
            newt = tmin;
        }
    }

    /*
        Update the interval of uncertainty. This update does not
        depend on the new step or the case analysis above.

        - Case a: if f(x) < f(t),
            x <- x, y <- t.
        - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
            x <- t, y <- y.
        - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
            x <- t, y <- x.
    */
    if (*fx < *ft)
    {
        /* Case a */
        *y = *t;
        *fy = *ft;
        *dy = *dt;
    }
    else
    {
        /* Case c */
        if (dsign)
        {
            *y = *x;
            *fy = *fx;
            *dy = *dx;
        }
        /* Cases b and c */
        *x = *t;
        *fx = *ft;
        *dx = *dt;
    }

    /* Clip the new trial value in [tmin, tmax]. */
    if (tmax < newt) newt = tmax;
    if (newt < tmin) newt = tmin;

    /*
        Redefine the new trial value if it is close to the upper bound
        of the interval.
    */
    if (*brackt && bound)
    {
        mq = *x + 0.66 * (*y - *x);
        if (*x < *y)
        {
            if (mq < newt) newt = mq;
        }
        else
        {
            if (newt < mq) newt = mq;
        }
    }

    /* Return the new trial value. */
    *t = newt;
    return 0;
}

/**
 * Compute the L1 norm of x over the index range [start, n).
 * @param x      Input array.
 * @param start  First index included in the sum.
 * @param n      One past the last index included in the sum.
 * @return Sum of |x[i]| for i in [start, n).
 */
double owlqn_x1norm(const gctl::array &x, const int start, const int n)
{
    double norm = 0.0;

    for (int i = start;i < n;++i)
    {
        norm += fabs(x[i]);
    }

    return norm;
}

/**
 * Compute the OWL-QN pseudo-gradient of the L1-regularized objective.
 * Indices outside [start, end) copy the plain gradient; inside the range
 * the subgradient of c*|x[i]| is added, with the sign chosen from x[i]
 * (or, at x[i] == 0, from whichever one-sided derivative is negative).
 *
 * @param pg     [out] Pseudo-gradient.
 * @param x      Current variables.
 * @param g      Gradient of the smooth part of the objective.
 * @param n      Number of variables.
 * @param c      L1 regularization coefficient (orthantwise_c).
 * @param start  First index subject to L1 regularization.
 * @param end    One past the last index subject to L1 regularization.
 */
void owlqn_pseudo_gradient(gctl::array &pg, const gctl::array &x, const gctl::array &g, const int n, const double c, const int start, const int end)
{
    int i;

    /* Outside the orthant-wise range the pseudo-gradient equals the gradient. */
    for (i = 0;i < start;++i)
    {
        pg[i] = g[i];
    }

    /* Compute the pseudo-gradients. */
    for (i = start;i < end;++i)
    {
        if (x[i] < 0.)
        {
            /* Differentiable. */
            pg[i] = g[i] - c;
        }
        else if (0. < x[i])
        {
            /* Differentiable. */
            pg[i] = g[i] + c;
        }
        else
        {
            if (g[i] < -c)
            {
                /* Take the right partial derivative. */
                pg[i] = g[i] + c;
            }
            else if (c < g[i])
            {
                /* Take the left partial derivative. */
                pg[i] = g[i] - c;
            }
            else
            {
                pg[i] = 0.;
            }
        }
    }

    for (i = end;i < n;++i)
    {
        pg[i] = g[i];
    }

    return;
}

/**
 * Project d onto the orthant defined by sign: any component of d in
 * [start, end) whose sign disagrees with (or is orthogonal to) the
 * corresponding sign[i] is zeroed.
 *
 * @param d      [in/out] Vector to project.
 * @param sign   Orthant indicator vector.
 * @param start  First index to project.
 * @param end    One past the last index to project.
 */
void owlqn_project(gctl::array &d, const gctl::array &sign, const int start, const int end)
{
    for (int i = start; i < end; ++i)
    {
        if (d[i] * sign[i] <= 0)
        {
            d[i] = 0;
        }
    }

    return;
}