// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2012 Google Inc. All rights reserved.
// http://code.google.com/p/ceres-solver/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)

#include "ceres/line_search_direction.h"

#include "ceres/internal/eigen.h"
#include "ceres/line_search_minimizer.h"
#include "ceres/low_rank_inverse_hessian.h"
#include "glog/logging.h"

namespace ceres {
namespace internal {

class SteepestDescent : public LineSearchDirection {
 public:
  virtual ~SteepestDescent() {}
  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
    *search_direction = -current.gradient;
    return true;
  }
};

class NonlinearConjugateGradient : public LineSearchDirection {
 public:
  NonlinearConjugateGradient(const NonlinearConjugateGradientType type,
                             const double function_tolerance)
      : type_(type),
        function_tolerance_(function_tolerance) {
  }

  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
    double beta = 0.0;
    Vector gradient_change;
    switch (type_) {
      case FLETCHER_REEVES:
        beta = current.gradient_squared_norm / previous.gradient_squared_norm;
        break;
      case POLAK_RIBIERE:
        gradient_change = current.gradient - previous.gradient;
        beta = (current.gradient.dot(gradient_change) /
                previous.gradient_squared_norm);
        break;
      case HESTENES_STIEFEL:
        gradient_change = current.gradient - previous.gradient;
        beta = (current.gradient.dot(gradient_change) /
                previous.search_direction.dot(gradient_change));
        break;
      default:
        LOG(FATAL) << "Unknown nonlinear conjugate gradient type: " << type_;
    }
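
    // The new direction is the negative gradient plus a beta-weighted
    // contribution of the previous search direction:
    //
    //   d_k = -g_k + beta * d_{k-1}
    //
    // If d_k is not a sufficiently strong descent direction (i.e. its
    // directional derivative g_k' * d_k is not clearly negative), conjugacy
    // has degraded and we restart from steepest descent below.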
    *search_direction = -current.gradient + beta * previous.search_direction;
    const double directional_derivative =
        current.gradient.dot(*search_direction);
    if (directional_derivative > -function_tolerance_) {
      LOG(WARNING) << "Restarting non-linear conjugate gradients: "
                   << directional_derivative;
      *search_direction = -current.gradient;
    }

    return true;
  }

 private:
  const NonlinearConjugateGradientType type_;
  const double function_tolerance_;
};

class LBFGS : public LineSearchDirection {
 public:
  LBFGS(const int num_parameters,
        const int max_lbfgs_rank,
        const bool use_approximate_eigenvalue_bfgs_scaling)
      : low_rank_inverse_hessian_(num_parameters,
                                  max_lbfgs_rank,
                                  use_approximate_eigenvalue_bfgs_scaling),
        is_positive_definite_(true) {}

  virtual ~LBFGS() {}

  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
    CHECK(is_positive_definite_)
        << "Ceres bug: NextDirection() called on L-BFGS after inverse Hessian "
        << "approximation has become indefinite, please contact the "
        << "developers!";

    // Fold the last accepted step (s_k = step_size * search_direction) and
    // the resulting gradient change (y_k) into the low-rank inverse Hessian
    // approximation.
    low_rank_inverse_hessian_.Update(
        previous.search_direction * previous.step_size,
        current.gradient - previous.gradient);

    search_direction->setZero();
    low_rank_inverse_hessian_.RightMultiply(current.gradient.data(),
                                            search_direction->data());
    *search_direction *= -1.0;

    if (search_direction->dot(current.gradient) >= 0.0) {
      LOG(WARNING) << "Numerical failure in L-BFGS update: inverse Hessian "
                   << "approximation is not positive definite, and thus "
                   << "initial gradient for search direction is positive: "
                   << search_direction->dot(current.gradient);
      is_positive_definite_ = false;
      return false;
    }

    return true;
  }

 private:
  LowRankInverseHessian low_rank_inverse_hessian_;
  bool is_positive_definite_;
};
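
// Callers do not construct these direction objects directly; they are
// selected via LineSearchDirection::Create() below. As a minimal usage
// sketch (assuming a Problem `problem` already populated with residual
// blocks), the public API reaches the L-BFGS direction roughly as follows:
//
//   ceres::Solver::Options options;
//   options.minimizer_type = ceres::LINE_SEARCH;
//   options.line_search_direction_type = ceres::LBFGS;
//   options.max_lbfgs_rank = 20;  // Rank of the low-rank approximation.
//   ceres::Solver::Summary summary;
//   ceres::Solve(options, &problem, &summary);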

class BFGS : public LineSearchDirection {
 public:
  BFGS(const int num_parameters,
       const bool use_approximate_eigenvalue_scaling)
      : num_parameters_(num_parameters),
        use_approximate_eigenvalue_scaling_(use_approximate_eigenvalue_scaling),
        initialized_(false),
        is_positive_definite_(true) {
    LOG_IF(WARNING, num_parameters_ >= 1e3)
        << "BFGS line search being created with: " << num_parameters_
        << " parameters, this will allocate a dense approximate inverse "
        << "Hessian of size: " << num_parameters_ << " x " << num_parameters_
        << ", consider using the L-BFGS memory-efficient line search "
        << "direction instead.";
    // Construct inverse_hessian_ after logging the size warning, so that if
    // the allocation crashes the process, the log highlights the likely
    // cause.
    inverse_hessian_ = Matrix::Identity(num_parameters, num_parameters);
  }

  virtual ~BFGS() {}

  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
    CHECK(is_positive_definite_)
        << "Ceres bug: NextDirection() called on BFGS after inverse Hessian "
        << "approximation has become indefinite, please contact the "
        << "developers!";

    const Vector delta_x = previous.search_direction * previous.step_size;
    const Vector delta_gradient = current.gradient - previous.gradient;
    const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);

    // The (L)BFGS algorithm explicitly requires that the secant equation:
    //
    //   B_{k+1} * s_k = y_k
    //
    // is satisfied at each iteration, where B_{k+1} is the approximated
    // Hessian at the (k+1)-th iteration, s_k = (x_{k+1} - x_k) and
    // y_k = (grad_{k+1} - grad_k). As the approximated Hessian must be
    // positive definite, this is equivalent to the condition:
    //
    //   s_k^T * y_k > 0    [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
    //
    // This condition would always be satisfied if the function were strictly
    // convex; alternatively, it is always satisfied provided that a Wolfe
    // line search is used (even if the function is not strictly convex).
    // See [1] (p138) for a proof.
    //
    // Although Ceres will always use a Wolfe line search when using (L)BFGS,
    // practical implementation considerations mean that the line search may
    // return a point that satisfies only the Armijo condition, and thus
    // could violate the secant equation. As such, we will only use a step
    // to update the Hessian approximation if:
    //
    //   s_k^T * y_k > tolerance
    //
    // It is important that the tolerance is very small (and >= 0), as
    // otherwise we might skip the update too often and fail to capture
    // important curvature information in the Hessian. For example, going
    // from 1e-10 to 1e-14 improves the NIST benchmark score from 43/54 to
    // 53/54.
    //
    // [1] Nocedal J., Wright S., Numerical Optimization, 2nd Ed., Springer,
    //     1999.
    //
    // TODO(alexs.mac): Consider using a damped BFGS update instead of
    // skipping the update.
    const double kBFGSSecantConditionHessianUpdateTolerance = 1e-14;
    if (delta_x_dot_delta_gradient <=
        kBFGSSecantConditionHessianUpdateTolerance) {
      VLOG(2) << "Skipping BFGS Update, delta_x_dot_delta_gradient too "
              << "small: " << delta_x_dot_delta_gradient << ", tolerance: "
              << kBFGSSecantConditionHessianUpdateTolerance
              << " (Secant condition).";
    } else {
      // Update the dense inverse Hessian approximation.

      if (!initialized_ && use_approximate_eigenvalue_scaling_) {
        // Rescale the initial inverse Hessian approximation (H_0), which is
        // iteratively updated, so that it is of similar 'size' to the true
        // inverse Hessian at the start point. As shown in [1]:
        //
        //   \gamma = (delta_gradient_{0}' * delta_x_{0}) /
        //            (delta_gradient_{0}' * delta_gradient_{0})
        //
        // satisfies:
        //
        //   (1 / \lambda_m) <= \gamma <= (1 / \lambda_1)
        //
        // where \lambda_1 and \lambda_m are the smallest and largest
        // eigenvalues of the true initial Hessian (not the inverse)
        // respectively. Thus, \gamma is an approximate eigenvalue of the
        // true inverse Hessian, and choosing H_0 = \gamma * I will yield a
        // starting point that has a similar scale to the true inverse
        // Hessian. This technique is widely reported to often improve
        // convergence; however, this is not universally true, particularly
        // if there are errors in the initial gradients, or if there are
        // significant differences in the sensitivity of the problem to the
        // parameters (i.e. the range of the magnitudes of the components of
        // the gradient is large).
        //
        // The origin of this rescaling trick is somewhat unclear; the
        // earliest reference appears to be Oren [1], however it is widely
        // discussed without specific attribution in various texts including
        // [2] (p143).
        //
        // [1] Oren S.S., Self-scaling variable metric (SSVM) algorithms
        //     Part II: Implementation and experiments, Management Science,
        //     20(5), 863-874, 1974.
        // [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999.
        const double approximate_eigenvalue_scale =
            delta_x_dot_delta_gradient / delta_gradient.dot(delta_gradient);
        inverse_hessian_ *= approximate_eigenvalue_scale;

        VLOG(4) << "Applying approximate_eigenvalue_scale: "
                << approximate_eigenvalue_scale << " to initial inverse "
                << "Hessian approximation.";
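
        // An illustrative aside (not used by the implementation): for a
        // quadratic f(x) = 0.5 * x' * H * x with H positive definite,
        // delta_gradient = H * delta_x, so the scale factor above reduces
        // to:
        //
        //   \gamma = (delta_x' * H * delta_x) / (delta_x' * H^2 * delta_x)
        //
        // Expanding delta_x in the eigenbasis of H shows that this ratio
        // always lies in [1 / \lambda_m, 1 / \lambda_1], matching the bound
        // quoted above.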
      }
      initialized_ = true;

      // Efficient O(num_parameters^2) BFGS update [2].
      //
      // Starting from the dense BFGS update detailed in Nocedal [2]
      // (p140/177) and using y_k = delta_gradient, s_k = delta_x:
      //
      //   \rho_k = 1.0 / (s_k' * y_k)
      //   V_k = I - \rho_k * y_k * s_k'
      //   H_k = (V_k' * H_{k-1} * V_k) + (\rho_k * s_k * s_k')
      //
      // This update involves matrix-matrix products, which are naively
      // O(N^3); however, we can exploit our knowledge that H_{k-1} is
      // positive definite, and thus by definition symmetric, to reduce the
      // cost of the update.
      //
      // Expanding the update above yields:
      //
      //   H_k = H_{k-1} +
      //         \rho_k * ( (1.0 + \rho_k * y_k' * H_{k-1} * y_k) * s_k * s_k'
      //                    - (s_k * y_k' * H_{k-1} + H_{k-1} * y_k * s_k') )
      //
      // Using A = (s_k * y_k' * H_{k-1}), and the knowledge that
      // H_{k-1} = H_{k-1}', the last term simplifies to (A + A'). Note that
      // although A is not symmetric, (A + A') is symmetric. For ease of
      // construction we also define:
      //
      //   B = (1 + \rho_k * y_k' * H_{k-1} * y_k) * s_k * s_k',
      //
      // which is by definition symmetric due to its construction from
      // s_k * s_k'.
      //
      // Now we can write the BFGS update as:
      //
      //   H_k = H_{k-1} + \rho_k * (B - (A + A'))

      // For efficiency, as H_k is by definition symmetric, we will only
      // maintain the *lower* triangle of H_k (and of all intermediary
      // terms).

      const double rho_k = 1.0 / delta_x_dot_delta_gradient;

      // Calculate: A = s_k * y_k' * H_{k-1}
      Matrix A = delta_x * (delta_gradient.transpose() *
                            inverse_hessian_.selfadjointView<Eigen::Lower>());

      // Calculate scalar: (1 + \rho_k * y_k' * H_{k-1} * y_k)
      const double delta_x_times_delta_x_transpose_scale_factor =
          (1.0 + (rho_k * delta_gradient.transpose() *
                  inverse_hessian_.selfadjointView<Eigen::Lower>() *
                  delta_gradient));
      // Calculate: B = (1 + \rho_k * y_k' * H_{k-1} * y_k) * s_k * s_k'
      Matrix B = Matrix::Zero(num_parameters_, num_parameters_);
      B.selfadjointView<Eigen::Lower>().
          rankUpdate(delta_x, delta_x_times_delta_x_transpose_scale_factor);
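
      // For reference, the textbook form of the same update, written as a
      // naive O(N^3) sketch one could use to verify the triangular
      // implementation below (not part of the production path):
      //
      //   Matrix H = inverse_hessian_.selfadjointView<Eigen::Lower>();
      //   const Matrix V =
      //       Matrix::Identity(num_parameters_, num_parameters_) -
      //       rho_k * delta_gradient * delta_x.transpose();
      //   inverse_hessian_ =
      //       V.transpose() * H * V + rho_k * delta_x * delta_x.transpose();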

      // Finally, update the inverse Hessian approximation according to:
      //
      //   H_k = H_{k-1} + \rho_k * (B - (A + A'))
      //
      // Note that (A + A') is symmetric, even though A is not.
      inverse_hessian_.triangularView<Eigen::Lower>() +=
          rho_k * (B - A - A.transpose());
    }

    *search_direction =
        inverse_hessian_.selfadjointView<Eigen::Lower>() *
        (-1.0 * current.gradient);

    if (search_direction->dot(current.gradient) >= 0.0) {
      LOG(WARNING) << "Numerical failure in BFGS update: inverse Hessian "
                   << "approximation is not positive definite, and thus "
                   << "initial gradient for search direction is positive: "
                   << search_direction->dot(current.gradient);
      is_positive_definite_ = false;
      return false;
    }

    return true;
  }

 private:
  const int num_parameters_;
  const bool use_approximate_eigenvalue_scaling_;
  Matrix inverse_hessian_;
  bool initialized_;
  bool is_positive_definite_;
};

LineSearchDirection*
LineSearchDirection::Create(const LineSearchDirection::Options& options) {
  if (options.type == STEEPEST_DESCENT) {
    return new SteepestDescent;
  }

  if (options.type == NONLINEAR_CONJUGATE_GRADIENT) {
    return new NonlinearConjugateGradient(
        options.nonlinear_conjugate_gradient_type,
        options.function_tolerance);
  }

  if (options.type == ceres::LBFGS) {
    return new ceres::internal::LBFGS(
        options.num_parameters,
        options.max_lbfgs_rank,
        options.use_approximate_eigenvalue_bfgs_scaling);
  }

  if (options.type == ceres::BFGS) {
    return new ceres::internal::BFGS(
        options.num_parameters,
        options.use_approximate_eigenvalue_bfgs_scaling);
  }

  LOG(ERROR) << "Unknown line search direction type: " << options.type;
  return NULL;
}

}  // namespace internal
}  // namespace ceres