The LM (Levenberg-Marquardt) algorithm belongs to the family of trust-region methods: it keeps the step h within a trust region, which guarantees that the Taylor expansion remains a good approximation.

The LM algorithm is a damped variant of the Gauss-Newton method.

1. Theory

The least squares problem:

x = \arg\min_{x} \{F(x)\}, \qquad F(x) = \frac{1}{2}\sum_{i=1}^{N} \| f_i(x) \|^2 = \frac{1}{2}\|\mathbf{f}(x)\|^2 = \frac{1}{2}\mathbf{f}(x)^T\mathbf{f}(x)

First-order Taylor expansion of \mathbf{f}:

\mathbf{f}(x+h) = \mathbf{f}(x) + \mathbf{J}(x)h + O(h^T h)

Dropping the higher-order term and substituting into F:

F(x+h) \approx L(h) = \frac{1}{2}(\mathbf{f}+\mathbf{J}h)^T(\mathbf{f}+\mathbf{J}h) = \frac{1}{2}\mathbf{f}^T\mathbf{f} + h^T\mathbf{J}^T\mathbf{f} + \frac{1}{2}h^T\mathbf{J}^T\mathbf{J}h

The idea of the damped method is to add a damping term \frac{1}{2}\mu h^T h to this model:

h = \arg\min_{h} \{G(h)\} = \arg\min_{h} \frac{1}{2}\mathbf{f}^T\mathbf{f} + h^T\mathbf{J}^T\mathbf{f} + \frac{1}{2}h^T\mathbf{J}^T\mathbf{J}h + \frac{1}{2}\mu h^T h

Differentiating with respect to h and setting the result to zero:

\mathbf{J}^T\mathbf{f} + \mathbf{J}^T\mathbf{J}h + \mu h = 0 \quad\Longrightarrow\quad (\mathbf{J}^T\mathbf{J} + \mu\mathbf{I})h = -\mathbf{J}^T\mathbf{f} \quad\Longrightarrow\quad (\mathbf{H} + \mu\mathbf{I})h = -\mathbf{g}
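For concreteness, this damped linear system can be solved in a few lines with Eigen. The following is a minimal sketch, not part of the original derivation; the function name and the fixed 3x3 dimensions are ours, chosen to match the example in section 2:

#include <eigen3/Eigen/Dense>

// Minimal sketch: solve the damped normal equations (H + mu*I) h = -g.
// H = J^T J and g = J^T f are assumed to be precomputed.
Eigen::Vector3d solveDampedStep(const Eigen::Matrix3d& H,
                                const Eigen::Vector3d& g,
                                double mu)
{
    Eigen::Matrix3d A = H + mu * Eigen::Matrix3d::Identity();
    return A.ldlt().solve(-g);   // LDLT applies: A is symmetric positive definite for mu > 0
}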

The damping parameter \mu plays several roles:

1. For \mu > 0, (\mathbf{H} + \mu\mathbf{I}) is positive definite, which guarantees that h is a descent direction.

2. When \mu is large, h \approx -\frac{1}{\mu}\mathbf{g} = -\frac{1}{\mu}\nabla F(x): the method behaves like steepest (gradient) descent, which works well when far from the optimum.

3. When \mu is small, the method approaches Gauss-Newton, which attains near-quadratic convergence close to the optimum.

Clearly, the choice of \mu matters. Initially, take

\mathbf{A}_0 = \mathbf{J}(x_0)^T\mathbf{J}(x_0), \qquad \mu_0 = \tau \cdot \max_i \{ a_{ii}^{(0)} \}

Afterwards, \mu is adjusted using the gain ratio, i.e. the actual cost decrease divided by the decrease predicted by the model L:

\varrho = \frac{F(x) - F(x+h)}{L(0) - L(h)}
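The \varrho-driven update of \mu (the rule used in the algorithm listing below) can be written as a small helper. This is a sketch under our naming, with mu and nu passed by reference:

#include <algorithm>
#include <cmath>

// Sketch of the gain-ratio-driven damping update, matching the listing below.
// mu is the damping parameter, nu the growth factor for rejected steps.
void updateDamping(double rho, double& mu, double& nu)
{
    if (rho > 0) {
        // Step accepted: the model predicted the cost decrease well, so relax damping.
        mu *= std::max(1.0 / 3.0, 1.0 - std::pow(2.0 * rho - 1.0, 3.0));
        nu = 2.0;
    } else {
        // Step rejected: increase damping, pushing the step toward gradient descent.
        mu *= nu;
        nu *= 2.0;
    }
}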

Termination criteria:

1. The first derivative is zero: F'(x) = \mathbf{g}(x) = 0; in practice, test \| \mathbf{g}(x) \| < \varepsilon_1, where \varepsilon_1 is a user-chosen threshold;

2. The change in x is sufficiently small: \| h \| = \| x_{new} - x \| < \varepsilon_2 ( \| x \| + \varepsilon_2 );

3. The maximum number of iterations is exceeded (see the sketch after this list).
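Expressed in code, the three tests might look like the following sketch; the variable names mirror the derivation above, not the class in section 2:

#include <eigen3/Eigen/Core>

// Sketch: combined stopping test. g, h, x are the current gradient, step,
// and parameter vector; k is the iteration counter.
bool shouldStop(const Eigen::VectorXd& g, const Eigen::VectorXd& h,
                const Eigen::VectorXd& x, int k,
                double eps1, double eps2, int max_iter)
{
    if (g.lpNorm<Eigen::Infinity>() < eps1)  return true;  // 1. gradient nearly zero
    if (h.norm() < eps2 * (x.norm() + eps2)) return true;  // 2. step sufficiently small
    return k >= max_iter;                                  // 3. iteration budget exhausted
}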

The LM algorithm proceeds as follows:

begin
    k := 0;  nu := 2;  x := x_0
    H := J^T J;  g := J^T f
    found := ( ||g||_inf < epsilon_1 )
    mu := tau * max_i{ a_ii }
    while (not found) and (k < k_max)
        k := k + 1
        solve (H + mu*I) h = -g
        if ||h|| <= epsilon_2 * ( ||x|| + epsilon_2 )
            found := true
        else
            x_new := x + h
            rho := ( F(x) - F(x_new) ) / ( L(0) - L(h) )
            if rho > 0                    { accept this step }
                x := x_new
                H := J^T J;  g := J^T f
                found := ( ||g||_inf < epsilon_1 )
                mu := mu * max{ 1/3, 1 - (2*rho - 1)^3 };  nu := 2
            else                          { reject: increase damping }
                mu := mu * nu;  nu := 2 * nu
end

2. Implementation

Problem (the same one used for Gauss-Newton): given the nonlinear model y = \exp(ax^2 + bx + c) and N observations \{x_i, y_i\}, estimate the coefficients X = [a, b, c]^T.

Analysis: let f(X) = y - \exp(ax^2 + bx + c). The N observations then form a large nonlinear system:

F(X) = \begin{bmatrix} y_1 - \exp(ax_1^2 + bx_1 + c) \\ \vdots \\ y_N - \exp(ax_N^2 + bx_N + c) \end{bmatrix}

from which we can build the least squares problem:

X = \arg\min_{X} \frac{1}{2} \| F(X) \|^2 .

To solve it, as the derivation above shows, we need the Jacobian:

J(X) = \begin{bmatrix} -x_1^2\exp(ax_1^2+bx_1+c) & -x_1\exp(ax_1^2+bx_1+c) & -\exp(ax_1^2+bx_1+c) \\ \vdots & \vdots & \vdots \\ -x_N^2\exp(ax_N^2+bx_N+c) & -x_N\exp(ax_N^2+bx_N+c) & -\exp(ax_N^2+bx_N+c) \end{bmatrix}
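Before plugging the analytic Jacobian into a solver, it is worth checking it against central differences. The sketch below is ours, not from the repository; model() and numericJacobianRow() are illustrative helper names:

#include <eigen3/Eigen/Core>
#include <cmath>

// Illustrative model: the function being fitted.
static double model(double a, double b, double c, double x)
{
    return std::exp(a * x * x + b * x + c);
}

// Sketch: one row of the Jacobian of f = y - model by central differences;
// it should match [-x^2, -x, -1] * model(a, b, c, x) from the matrix above.
Eigen::Vector3d numericJacobianRow(double a, double b, double c, double x,
                                   double eps = 1e-6)
{
    Eigen::Vector3d row;
    row(0) = -(model(a + eps, b, c, x) - model(a - eps, b, c, x)) / (2.0 * eps);
    row(1) = -(model(a, b + eps, c, x) - model(a, b - eps, c, x)) / (2.0 * eps);
    row(2) = -(model(a, b, c + eps, x) - model(a, b, c - eps, x)) / (2.0 * eps);
    return row;
}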

With these pieces, the steps from the derivation can be carried out directly. Code:

https://github.com/ydsf16/LevenbergMarquardt

/**
* This file is part of LevenbergMarquardt Solver.
*
* Copyright (C) 2018-2020 Dongsheng Yang <[email protected]> (Beihang University)
* For more information see <https://github.com/ydsf16/LevenbergMarquardt>
*
* LevenbergMarquardt is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* LevenbergMarquardt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with LevenbergMarquardt. If not, see <http://www.gnu.org/licenses/>.
*/

#include <iostream>
#include <iomanip>    // std::setw
#include <vector>
#include <algorithm>  // std::max_element, std::max
#include <cmath>
#include <chrono>
#include <eigen3/Eigen/Core>
#include <eigen3/Eigen/Dense>
#include <eigen3/Eigen/Cholesky>
#include <opencv2/opencv.hpp>

/* Timer class */
class Runtimer{
public:
    inline void start()
    {
        t_s_ = std::chrono::steady_clock::now();
    }

    inline void stop()
    {
        t_e_ = std::chrono::steady_clock::now();
    }

    inline double duration()
    {
        return std::chrono::duration_cast<std::chrono::duration<double>>(t_e_ - t_s_).count() * 1000.0;
    }

private:
    std::chrono::steady_clock::time_point t_s_; // start time point
    std::chrono::steady_clock::time_point t_e_; // stop time point
};

/* LM optimizer for fitting y = exp(a*x^2 + b*x + c) */
class LevenbergMarquardt{
public:
    LevenbergMarquardt(double* a, double* b, double* c):
        a_(a), b_(b), c_(c)
    {
        epsilon_1_ = 1e-6;
        epsilon_2_ = 1e-6;
        max_iter_ = 50;
        is_out_ = true;
    }

    void setParameters(double epsilon_1, double epsilon_2, int max_iter, bool is_out)
    {
        epsilon_1_ = epsilon_1;
        epsilon_2_ = epsilon_2;
        max_iter_ = max_iter;
        is_out_ = is_out;
    }

    void addObservation(const double& x, const double& y)
    {
        obs_.push_back(Eigen::Vector2d(x, y));
    }

    /* Jacobian J and residual vector f for the current parameters */
    void calcJ_fx()
    {
        J_.resize(obs_.size(), 3);
        fx_.resize(obs_.size(), 1);

        for ( size_t i = 0; i < obs_.size(); i ++)
        {
            const Eigen::Vector2d& ob = obs_.at(i);
            const double& x = ob(0);
            const double& y = ob(1);
            double j1 = -x*x*exp(*a_ * x*x + *b_*x + *c_);
            double j2 = -x*exp(*a_ * x*x + *b_*x + *c_);
            double j3 = -exp(*a_ * x*x + *b_*x + *c_);
            J_(i, 0) = j1;
            J_(i, 1) = j2;
            J_(i, 2) = j3;
            fx_(i, 0) = y - exp(*a_ * x*x + *b_*x + *c_);
        }
    }

    /* H = J^T J and g = -J^T f */
    void calcH_g()
    {
        H_ = J_.transpose() * J_;
        g_ = -J_.transpose() * fx_;
    }

    /* current cost f^T f */
    double getCost()
    {
        Eigen::MatrixXd cost = fx_.transpose() * fx_;
        return cost(0,0);
    }

    /* F = 0.5 * ||f||^2 evaluated at arbitrary parameters (a, b, c) */
    double F(double a, double b, double c)
    {
        Eigen::MatrixXd fx;
        fx.resize(obs_.size(), 1);

        for ( size_t i = 0; i < obs_.size(); i ++)
        {
            const Eigen::Vector2d& ob = obs_.at(i);
            const double& x = ob(0);
            const double& y = ob(1);
            fx(i, 0) = y - exp(a*x*x + b*x + c);
        }
        Eigen::MatrixXd F = 0.5 * fx.transpose() * fx;
        return F(0,0);
    }

    /* predicted cost decrease L(0) - L(h) = -h^T J^T f - 0.5 * h^T J^T J h */
    double L0_L( Eigen::Vector3d& h)
    {
        Eigen::MatrixXd L = -h.transpose() * J_.transpose() * fx_ - 0.5 * h.transpose() * J_.transpose() * J_ * h;
        return L(0,0);
    }
    void solve()
    {
        int k = 0;
        double nu = 2.0;
        calcJ_fx();
        calcH_g();
        bool found = ( g_.lpNorm<Eigen::Infinity>() < epsilon_1_ );

        /* mu_0 = tau * max_i{a_ii}, here with tau = 1 */
        std::vector<double> A;
        A.push_back( H_(0, 0) );
        A.push_back( H_(1, 1) );
        A.push_back( H_(2, 2) );
        auto max_p = std::max_element(A.begin(), A.end());
        double mu = *max_p;

        double sumt = 0;

        while ( !found && k < max_iter_ )
        {
            Runtimer t;
            t.start();

            k = k + 1;
            /* solve (H + mu*I) h = -J^T f; g_ already holds -J^T f */
            Eigen::Matrix3d G = H_ + mu * Eigen::Matrix3d::Identity();
            Eigen::Vector3d h = G.ldlt().solve(g_);

            if( h.norm() <= epsilon_2_ * ( sqrt(*a_ * *a_ + *b_ * *b_ + *c_ * *c_) + epsilon_2_ ) )
                found = true;
            else
            {
                double na = *a_ + h(0);
                double nb = *b_ + h(1);
                double nc = *c_ + h(2);

                /* gain ratio: actual cost decrease over predicted decrease */
                double rho = ( F(*a_, *b_, *c_) - F(na, nb, nc) ) / L0_L(h);

                if( rho > 0 )   // accept this step
                {
                    *a_ = na;
                    *b_ = nb;
                    *c_ = nc;
                    calcJ_fx();
                    calcH_g();

                    found = ( g_.lpNorm<Eigen::Infinity>() < epsilon_1_ );
                    mu = mu * std::max<double>(0.33, 1 - std::pow(2*rho - 1, 3));  // 0.33 approximates the 1/3 in the update rule
                    nu = 2.0;
                }
                else            // reject the step: increase damping
                {
                    mu = mu * nu;
                    nu = 2 * nu;
                }
            } // if step is too small

            t.stop();
            if( is_out_ )
            {
                std::cout << "Iter: " << std::left << std::setw(3) << k
                          << " Result: " << std::left << std::setw(10) << *a_ << " "
                          << std::left << std::setw(10) << *b_ << " "
                          << std::left << std::setw(10) << *c_
                          << " step: " << std::left << std::setw(14) << h.norm()
                          << " cost: " << std::left << std::setw(14) << getCost()
                          << " time: " << std::left << std::setw(14) << t.duration()
                          << " total_time: " << std::left << std::setw(14)
                          << (sumt += t.duration()) << std::endl;
            }
        } // while

        if( found )
            std::cout << "\nConverged\n\n";
        else
            std::cout << "\nDiverged\n\n";
    } // solve

    Eigen::MatrixXd fx_;                 // residual vector f(x)
    Eigen::MatrixXd J_;                  // Jacobian matrix
    Eigen::Matrix3d H_;                  // H = J^T J
    Eigen::Vector3d g_;                  // g = -J^T f

    std::vector<Eigen::Vector2d> obs_;   // observations (x_i, y_i)

    /* the three parameters to estimate */
    double *a_, *b_, *c_;

    /* solver parameters */
    double epsilon_1_, epsilon_2_;
    int max_iter_;
    bool is_out_;
}; // class LevenbergMarquardt

int main(int argc, char **argv)
{
    const double aa = 0.1, bb = 0.5, cc = 2;   // ground-truth parameters
    double a = 0.0, b = 0.0, c = 0.0;          // initial guess

    /* set up the problem */
    LevenbergMarquardt lm(&a, &b, &c);
    lm.setParameters(1e-10, 1e-10, 100, true);

    /* generate data */
    const size_t N = 100;                      // number of observations
    cv::RNG rng(cv::getTickCount());
    for( size_t i = 0; i < N; i ++)
    {
        /* sample with Gaussian noise */
        double x = rng.uniform(0.0, 1.0);
        double y = exp(aa*x*x + bb*x + cc) + rng.gaussian(0.05);

        /* add to the observations */
        lm.addObservation(x, y);
    }
    /* solve with LM */
    lm.solve();

    return 0;
}
