// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2010, 2011, 2012 Google Inc. All rights reserved.
// http://code.google.com/p/ceres-solver/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
//
// TODO(sameeragarwal): row_block_counter can perhaps be replaced by
// Chunk::start ?

#ifndef CERES_INTERNAL_SCHUR_ELIMINATOR_IMPL_H_
#define CERES_INTERNAL_SCHUR_ELIMINATOR_IMPL_H_

// Eigen has an internal threshold for switching between different
// matrix multiplication algorithms. In particular, for matrices
// larger than EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD it uses a cache
// friendly matrix matrix product algorithm that has a higher setup
// cost. For matrix sizes close to this threshold, especially when the
// matrices are thin and long, the default choice may not be
// optimal. This is the case for us, as the default choice caused a
// 30% performance regression when we moved from Eigen2 to Eigen3.

#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 10
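// (This override must appear before the Eigen headers are included
// below; otherwise Eigen's built-in default value takes effect.)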

#ifdef CERES_USE_OPENMP
#include <omp.h>
#endif

#include <algorithm>
#include <map>
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/fixed_array.h"
#include "ceres/internal/scoped_ptr.h"
#include "ceres/map_util.h"
#include "ceres/schur_eliminator.h"
#include "ceres/small_blas.h"
#include "ceres/stl_util.h"
#include "Eigen/Dense"
#include "glog/logging.h"

namespace ceres {
namespace internal {

template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::~SchurEliminator() {
  STLDeleteElements(&rhs_locks_);
}

template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
Init(int num_eliminate_blocks, const CompressedRowBlockStructure* bs) {
  CHECK_GT(num_eliminate_blocks, 0)
      << "SchurComplementSolver cannot be initialized with "
      << "num_eliminate_blocks = 0.";

  num_eliminate_blocks_ = num_eliminate_blocks;

  const int num_col_blocks = bs->cols.size();
  const int num_row_blocks = bs->rows.size();

  buffer_size_ = 1;
  chunks_.clear();
  lhs_row_layout_.clear();

  int lhs_num_rows = 0;
  // Add a map object for each block in the reduced linear system
  // and build the row/column block structure of the reduced linear
  // system.
  lhs_row_layout_.resize(num_col_blocks - num_eliminate_blocks_);
  for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
    lhs_row_layout_[i - num_eliminate_blocks_] = lhs_num_rows;
    lhs_num_rows += bs->cols[i].size;
  }
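
  // For example (hypothetical sizes): with num_eliminate_blocks_ = 2
  // and column block sizes {3, 3, 2, 4}, the two remaining f_blocks
  // have sizes 2 and 4, so lhs_row_layout_ = {0, 2} and the reduced
  // linear system has lhs_num_rows = 6 rows.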

  int r = 0;
  // Iterate over the row blocks of A, and detect the chunks. The
  // matrix should already have been ordered so that all rows
  // containing the same y block are vertically contiguous. Along
  // the way also compute the amount of space each chunk will need
  // to perform the elimination.
  while (r < num_row_blocks) {
    const int chunk_block_id = bs->rows[r].cells.front().block_id;
    if (chunk_block_id >= num_eliminate_blocks_) {
      break;
    }

    chunks_.push_back(Chunk());
    Chunk& chunk = chunks_.back();
    chunk.size = 0;
    chunk.start = r;
    int buffer_size = 0;
    const int e_block_size = bs->cols[chunk_block_id].size;

    // Add rows to the chunk until the first block in the row is
    // different from the one in the first row of the chunk.
    while (r + chunk.size < num_row_blocks) {
      const CompressedRow& row = bs->rows[r + chunk.size];
      if (row.cells.front().block_id != chunk_block_id) {
        break;
      }

      // Iterate over the blocks in the row, ignoring the first
      // block since it is the one to be eliminated.
      for (int c = 1; c < row.cells.size(); ++c) {
        const Cell& cell = row.cells[c];
        if (InsertIfNotPresent(
                &(chunk.buffer_layout), cell.block_id, buffer_size)) {
          buffer_size += e_block_size * bs->cols[cell.block_id].size;
        }
      }

      buffer_size_ = std::max(buffer_size, buffer_size_);
      ++chunk.size;
    }

    CHECK_GT(chunk.size, 0);
    r += chunk.size;
  }
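
  // For example (hypothetical structure): if rows 0-2 of A all start
  // with e_block 0 and rows 3-4 all start with e_block 1, the loop
  // above produces two chunks, {start: 0, size: 3} and
  // {start: 3, size: 2}.
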
  const Chunk& chunk = chunks_.back();

  uneliminated_row_begins_ = chunk.start + chunk.size;
  if (num_threads_ > 1) {
    std::random_shuffle(chunks_.begin(), chunks_.end());
  }

  buffer_.reset(new double[buffer_size_ * num_threads_]);

  // chunk_outer_product_buffer_ only needs to store e_block_size *
  // f_block_size, which is always less than buffer_size_, so we just
  // allocate buffer_size_ per thread.
  chunk_outer_product_buffer_.reset(new double[buffer_size_ * num_threads_]);

  STLDeleteElements(&rhs_locks_);
  rhs_locks_.resize(num_col_blocks - num_eliminate_blocks_);
  for (int i = 0; i < num_col_blocks - num_eliminate_blocks_; ++i) {
    rhs_locks_[i] = new Mutex;
  }
}

template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
Eliminate(const BlockSparseMatrix* A,
          const double* b,
          const double* D,
          BlockRandomAccessMatrix* lhs,
          double* rhs) {
  if (lhs->num_rows() > 0) {
    lhs->SetZero();
    VectorRef(rhs, lhs->num_rows()).setZero();
  }

  const CompressedRowBlockStructure* bs = A->block_structure();
  const int num_col_blocks = bs->cols.size();

  // Add the diagonal to the Schur complement.
  if (D != NULL) {
#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
    for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
      const int block_id = i - num_eliminate_blocks_;
      int r, c, row_stride, col_stride;
      CellInfo* cell_info = lhs->GetCell(block_id, block_id,
                                         &r, &c,
                                         &row_stride, &col_stride);
      if (cell_info != NULL) {
        const int block_size = bs->cols[i].size;
        typename EigenTypes<kFBlockSize>::ConstVectorRef
            diag(D + bs->cols[i].position, block_size);

        CeresMutexLock l(&cell_info->m);
        MatrixRef m(cell_info->values, row_stride, col_stride);
        m.block(r, c, block_size, block_size).diagonal()
            += diag.array().square().matrix();
      }
    }
  }

  // Eliminate y blocks one chunk at a time. For each chunk, compute
  // the entries of the normal equations and the gradient vector block
  // corresponding to the y block and then apply Gaussian elimination
  // to them. The matrix ete stores the normal matrix corresponding to
  // the block being eliminated and the array buffer_ contains the
  // non-zero blocks in the row corresponding to this y block in the
  // normal equations. This computation is done in
  // ChunkDiagonalBlockAndGradient. UpdateRhs then applies Gaussian
  // elimination to the rhs of the normal equations, updating the rhs
  // of the reduced linear system by modifying rhs blocks for all the
  // z blocks that share a row block/residual term with the y
  // block. ChunkOuterProduct does the corresponding operation for the
  // lhs of the reduced linear system.
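  //
  // In block form, the normal equations for a chunk are (a sketch of
  // the standard derivation)
  //
  //   [ E'E  E'F ] [y]   [E'b]
  //   [ F'E  F'F ] [z] = [F'b]
  //
  // and eliminating y via y = (E'E)^{-1}(E'b - E'F z) leaves the
  // reduced system
  //
  //   (F'F - F'E(E'E)^{-1}E'F) z = F'b - F'E(E'E)^{-1}E'b
  //
  // whose left and right hand sides are exactly the S and rhs
  // accumulated below.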
#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
  for (int i = 0; i < chunks_.size(); ++i) {
#ifdef CERES_USE_OPENMP
    int thread_id = omp_get_thread_num();
#else
    int thread_id = 0;
#endif
    double* buffer = buffer_.get() + thread_id * buffer_size_;
    const Chunk& chunk = chunks_[i];
    const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
    const int e_block_size = bs->cols[e_block_id].size;

    VectorRef(buffer, buffer_size_).setZero();

    typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
        ete(e_block_size, e_block_size);

    if (D != NULL) {
      const typename EigenTypes<kEBlockSize>::ConstVectorRef
          diag(D + bs->cols[e_block_id].position, e_block_size);
      ete = diag.array().square().matrix().asDiagonal();
    } else {
      ete.setZero();
    }

    FixedArray<double, 8> g(e_block_size);
    typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size);
    gref.setZero();

    // We are going to be computing
    //
    //   S += F'F - F'E(E'E)^{-1}E'F
    //
    // for each Chunk. The computation is broken down into a number of
    // function calls as below.

    // Compute the outer product of the e_blocks with themselves (ete
    // = E'E). Compute the product of the e_blocks with the
    // corresponding f_blocks (buffer = E'F), the gradient of the terms
    // in this chunk (g) and add the outer product of the f_blocks to
    // the Schur complement (S += F'F).
    ChunkDiagonalBlockAndGradient(
        chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs);

    // Normally one wouldn't compute the inverse explicitly, but
    // e_block_size will typically be a small number like 3, in
    // which case it's much faster to compute the inverse once and
    // use it to multiply other matrices/vectors instead of doing a
    // Solve call over and over again.
    typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix inverse_ete =
        ete
        .template selfadjointView<Eigen::Upper>()
        .llt()
        .solve(Matrix::Identity(e_block_size, e_block_size));

    // For the current chunk compute and update the rhs of the reduced
    // linear system.
    //
    //   rhs = F'b - F'E(E'E)^{-1}E'b

    FixedArray<double, 8> inverse_ete_g(e_block_size);
    MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
        inverse_ete.data(),
        e_block_size,
        e_block_size,
        g.get(),
        inverse_ete_g.get());

    UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);

    // S -= F'E(E'E)^{-1}E'F
    ChunkOuterProduct(bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
  }

  // For rows with no e_blocks, the Schur complement update reduces to
  // S += F'F.
  NoEBlockRowsUpdate(A, b, uneliminated_row_begins_, lhs, rhs);
}

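// Given the reduced solution z, back substitute to recover the
// eliminated variables. For each chunk this solves (a sketch; when D
// is non-NULL its squared entries are added to the diagonal of E'E)
//
//   y = (E'E)^{-1} E'(b - F z)
//
// which is the first block row of the normal equations with z held
// fixed.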
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
BackSubstitute(const BlockSparseMatrix* A,
               const double* b,
               const double* D,
               const double* z,
               double* y) {
  const CompressedRowBlockStructure* bs = A->block_structure();
#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
  for (int i = 0; i < chunks_.size(); ++i) {
    const Chunk& chunk = chunks_[i];
    const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
    const int e_block_size = bs->cols[e_block_id].size;

    double* y_ptr = y + bs->cols[e_block_id].position;
    typename EigenTypes<kEBlockSize>::VectorRef y_block(y_ptr, e_block_size);

    typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
        ete(e_block_size, e_block_size);
    if (D != NULL) {
      const typename EigenTypes<kEBlockSize>::ConstVectorRef
          diag(D + bs->cols[e_block_id].position, e_block_size);
      ete = diag.array().square().matrix().asDiagonal();
    } else {
      ete.setZero();
    }

    const double* values = A->values();
    for (int j = 0; j < chunk.size; ++j) {
      const CompressedRow& row = bs->rows[chunk.start + j];
      const Cell& e_cell = row.cells.front();
      DCHECK_EQ(e_block_id, e_cell.block_id);

      FixedArray<double, 8> sj(row.block.size);

      typename EigenTypes<kRowBlockSize>::VectorRef(sj.get(), row.block.size) =
          typename EigenTypes<kRowBlockSize>::ConstVectorRef
          (b + bs->rows[chunk.start + j].block.position, row.block.size);

      for (int c = 1; c < row.cells.size(); ++c) {
        const int f_block_id = row.cells[c].block_id;
        const int f_block_size = bs->cols[f_block_id].size;
        const int r_block = f_block_id - num_eliminate_blocks_;

        MatrixVectorMultiply<kRowBlockSize, kFBlockSize, -1>(
            values + row.cells[c].position, row.block.size, f_block_size,
            z + lhs_row_layout_[r_block],
            sj.get());
      }

      MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
          values + e_cell.position, row.block.size, e_block_size,
          sj.get(),
          y_ptr);

      MatrixTransposeMatrixMultiply
          <kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
              values + e_cell.position, row.block.size, e_block_size,
              values + e_cell.position, row.block.size, e_block_size,
              ete.data(), 0, 0, e_block_size, e_block_size);
    }

    ete.llt().solveInPlace(y_block);
  }
}

// Update the rhs of the reduced linear system. Compute
//
//   rhs = F'b - F'E(E'E)^{-1}E'b
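//
// This is done one row at a time: for each row j of the chunk,
//
//   s_j = b_j - E_j * inverse_ete_g,   inverse_ete_g = (E'E)^{-1}E'b,
//   rhs += F_j' s_j,
//
// with updates to shared rhs blocks serialized via rhs_locks_.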
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
UpdateRhs(const Chunk& chunk,
          const BlockSparseMatrix* A,
          const double* b,
          int row_block_counter,
          const double* inverse_ete_g,
          double* rhs) {
  const CompressedRowBlockStructure* bs = A->block_structure();
  const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
  const int e_block_size = bs->cols[e_block_id].size;

  int b_pos = bs->rows[row_block_counter].block.position;
  const double* values = A->values();
  for (int j = 0; j < chunk.size; ++j) {
    const CompressedRow& row = bs->rows[row_block_counter + j];
    const Cell& e_cell = row.cells.front();

    typename EigenTypes<kRowBlockSize>::Vector sj =
        typename EigenTypes<kRowBlockSize>::ConstVectorRef
        (b + b_pos, row.block.size);

    MatrixVectorMultiply<kRowBlockSize, kEBlockSize, -1>(
        values + e_cell.position, row.block.size, e_block_size,
        inverse_ete_g, sj.data());

    for (int c = 1; c < row.cells.size(); ++c) {
      const int block_id = row.cells[c].block_id;
      const int block_size = bs->cols[block_id].size;
      const int block = block_id - num_eliminate_blocks_;
      CeresMutexLock l(rhs_locks_[block]);
      MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
          values + row.cells[c].position,
          row.block.size, block_size,
          sj.data(), rhs + lhs_row_layout_[block]);
    }
    b_pos += row.block.size;
  }
}

// Given a Chunk - a set of rows with the same e_block, e.g. the
// following Chunk with two rows
//
//                E                   F
//      [ y11   0   0   0 |  z11     0    0   0    z51]
//      [ y12   0   0   0 |  z12   z22    0   0      0]
//
// this function computes two matrices. The diagonal block matrix
//
//   ete = y11' * y11 + y12' * y12
//
// and the off diagonal blocks in the Gauss-Newton Hessian
//
//   buffer = [y11' * z11 + y12' * z12, y12' * z22, y11' * z51]
//
// which are zero compressed versions of the block sparse matrices E'E
// and E'F. It also computes the gradient of the e_block, E'b.
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
ChunkDiagonalBlockAndGradient(
    const Chunk& chunk,
    const BlockSparseMatrix* A,
    const double* b,
    int row_block_counter,
    typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* ete,
    double* g,
    double* buffer,
    BlockRandomAccessMatrix* lhs) {
  const CompressedRowBlockStructure* bs = A->block_structure();

  int b_pos = bs->rows[row_block_counter].block.position;
  const int e_block_size = ete->rows();

  // Iterate over the rows in this chunk. For each row, compute the
  // contribution of its F blocks to the Schur complement, the
  // contribution of its E block to the matrix E'E (ete), and the
  // corresponding block in the gradient vector.
  const double* values = A->values();
  for (int j = 0; j < chunk.size; ++j) {
    const CompressedRow& row = bs->rows[row_block_counter + j];

    if (row.cells.size() > 1) {
      EBlockRowOuterProduct(A, row_block_counter + j, lhs);
    }

    // Extract the e_block: ete += E_i' E_i
    const Cell& e_cell = row.cells.front();
    MatrixTransposeMatrixMultiply
        <kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
            values + e_cell.position, row.block.size, e_block_size,
            values + e_cell.position, row.block.size, e_block_size,
            ete->data(), 0, 0, e_block_size, e_block_size);

    // g += E_i' b_i
    MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
        values + e_cell.position, row.block.size, e_block_size,
        b + b_pos,
        g);

    // buffer = E'F. This computation is done by iterating over the
    // f_blocks for each row in the chunk.
    for (int c = 1; c < row.cells.size(); ++c) {
      const int f_block_id = row.cells[c].block_id;
      const int f_block_size = bs->cols[f_block_id].size;
      double* buffer_ptr =
          buffer + FindOrDie(chunk.buffer_layout, f_block_id);
      MatrixTransposeMatrixMultiply
          <kRowBlockSize, kEBlockSize, kRowBlockSize, kFBlockSize, 1>(
          values + e_cell.position, row.block.size, e_block_size,
          values + row.cells[c].position, row.block.size, f_block_size,
          buffer_ptr, 0, 0, e_block_size, f_block_size);
    }
    b_pos += row.block.size;
  }
}

// Compute the outer product F'E(E'E)^{-1}E'F and subtract it from the
// Schur complement matrix, i.e.
//
//   S -= F'E(E'E)^{-1}E'F.
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
ChunkOuterProduct(const CompressedRowBlockStructure* bs,
                  const Matrix& inverse_ete,
                  const double* buffer,
                  const BufferLayoutType& buffer_layout,
                  BlockRandomAccessMatrix* lhs) {
  // This is the most computationally expensive part of this
  // code. Profiling experiments reveal that the bottleneck is not the
  // computation of the right-hand matrix product, but memory
  // references to the left hand side.
  const int e_block_size = inverse_ete.rows();
  BufferLayoutType::const_iterator it1 = buffer_layout.begin();

#ifdef CERES_USE_OPENMP
  int thread_id = omp_get_thread_num();
#else
  int thread_id = 0;
#endif
  double* b1_transpose_inverse_ete =
      chunk_outer_product_buffer_.get() + thread_id * buffer_size_;

  // S(i,j) -= b_i' * ete^{-1} * b_j
  for (; it1 != buffer_layout.end(); ++it1) {
    const int block1 = it1->first - num_eliminate_blocks_;
    const int block1_size = bs->cols[it1->first].size;
    MatrixTransposeMatrixMultiply
        <kEBlockSize, kFBlockSize, kEBlockSize, kEBlockSize, 0>(
        buffer + it1->second, e_block_size, block1_size,
        inverse_ete.data(), e_block_size, e_block_size,
        b1_transpose_inverse_ete, 0, 0, block1_size, e_block_size);

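    // Since the Schur complement is symmetric, it suffices to update
    // its upper triangular part: it2 starts at it1, so block2 >= block1.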
    BufferLayoutType::const_iterator it2 = it1;
    for (; it2 != buffer_layout.end(); ++it2) {
      const int block2 = it2->first - num_eliminate_blocks_;

      int r, c, row_stride, col_stride;
      CellInfo* cell_info = lhs->GetCell(block1, block2,
                                         &r, &c,
                                         &row_stride, &col_stride);
      if (cell_info != NULL) {
        const int block2_size = bs->cols[it2->first].size;
        CeresMutexLock l(&cell_info->m);
        MatrixMatrixMultiply
            <kFBlockSize, kEBlockSize, kEBlockSize, kFBlockSize, -1>(
                b1_transpose_inverse_ete, block1_size, e_block_size,
                buffer + it2->second, e_block_size, block2_size,
                cell_info->values, r, c, row_stride, col_stride);
      }
    }
  }
}

// For rows with no e_blocks, the Schur complement update reduces to
// S += F'F. This function iterates over the rows of A with no e_block,
// and calls NoEBlockRowOuterProduct on each row.
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
NoEBlockRowsUpdate(const BlockSparseMatrix* A,
                   const double* b,
                   int row_block_counter,
                   BlockRandomAccessMatrix* lhs,
                   double* rhs) {
  const CompressedRowBlockStructure* bs = A->block_structure();
  const double* values = A->values();
  for (; row_block_counter < bs->rows.size(); ++row_block_counter) {
    const CompressedRow& row = bs->rows[row_block_counter];
    for (int c = 0; c < row.cells.size(); ++c) {
      const int block_id = row.cells[c].block_id;
      const int block_size = bs->cols[block_id].size;
      const int block = block_id - num_eliminate_blocks_;
      MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
          values + row.cells[c].position, row.block.size, block_size,
          b + row.block.position,
          rhs + lhs_row_layout_[block]);
    }
    NoEBlockRowOuterProduct(A, row_block_counter, lhs);
  }
}

// A row r of A, which has no e_blocks, gets added to the Schur
// complement as S += r r'. This function is responsible for computing
// the contribution of a single row r to the Schur complement. It is
// very similar in structure to EBlockRowOuterProduct except for one
// difference: it does not use any of the template parameters. This is
// because the algorithm used for detecting the static structure of
// the matrix A only pays attention to rows with e_blocks. This is
// because rows without e_blocks are rare and typically arise from
// regularization terms in the original optimization problem, and have
// a very different structure from the rows with e_blocks. Including
// them in the static structure detection would lead to most template
// parameters being set to dynamic. Since the number of rows without
// e_blocks is small, the lack of templating is not an issue.
    591 void
    592 SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
    593 NoEBlockRowOuterProduct(const BlockSparseMatrix* A,
    594                      int row_block_index,
    595                      BlockRandomAccessMatrix* lhs) {
    596   const CompressedRowBlockStructure* bs = A->block_structure();
    597   const CompressedRow& row = bs->rows[row_block_index];
    598   const double* values = A->values();
    599   for (int i = 0; i < row.cells.size(); ++i) {
    600     const int block1 = row.cells[i].block_id - num_eliminate_blocks_;
    601     DCHECK_GE(block1, 0);
    602 
    603     const int block1_size = bs->cols[row.cells[i].block_id].size;
    604     int r, c, row_stride, col_stride;
    605     CellInfo* cell_info = lhs->GetCell(block1, block1,
    606                                        &r, &c,
    607                                        &row_stride, &col_stride);
    608     if (cell_info != NULL) {
    609       CeresMutexLock l(&cell_info->m);
    610       // This multiply currently ignores the fact that this is a
    611       // symmetric outer product.
    612       MatrixTransposeMatrixMultiply
    613           <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
    614               values + row.cells[i].position, row.block.size, block1_size,
    615               values + row.cells[i].position, row.block.size, block1_size,
    616               cell_info->values, r, c, row_stride, col_stride);
    617     }
    618 
    619     for (int j = i + 1; j < row.cells.size(); ++j) {
    620       const int block2 = row.cells[j].block_id - num_eliminate_blocks_;
    621       DCHECK_GE(block2, 0);
    622       DCHECK_LT(block1, block2);
    623       int r, c, row_stride, col_stride;
    624       CellInfo* cell_info = lhs->GetCell(block1, block2,
    625                                          &r, &c,
    626                                          &row_stride, &col_stride);
    627       if (cell_info != NULL) {
    628         const int block2_size = bs->cols[row.cells[j].block_id].size;
    629         CeresMutexLock l(&cell_info->m);
    630         MatrixTransposeMatrixMultiply
    631             <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
    632                 values + row.cells[i].position, row.block.size, block1_size,
    633                 values + row.cells[j].position, row.block.size, block2_size,
    634                 cell_info->values, r, c, row_stride, col_stride);
    635       }
    636     }
    637   }
    638 }

// For a row with an e_block, compute the contribution S += F'F. This
// function has the same structure as NoEBlockRowOuterProduct, except
// that it uses the template parameters.
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
EBlockRowOuterProduct(const BlockSparseMatrix* A,
                      int row_block_index,
                      BlockRandomAccessMatrix* lhs) {
  const CompressedRowBlockStructure* bs = A->block_structure();
  const CompressedRow& row = bs->rows[row_block_index];
  const double* values = A->values();
  for (int i = 1; i < row.cells.size(); ++i) {
    const int block1 = row.cells[i].block_id - num_eliminate_blocks_;
    DCHECK_GE(block1, 0);

    const int block1_size = bs->cols[row.cells[i].block_id].size;
    int r, c, row_stride, col_stride;
    CellInfo* cell_info = lhs->GetCell(block1, block1,
                                       &r, &c,
                                       &row_stride, &col_stride);
    if (cell_info != NULL) {
      CeresMutexLock l(&cell_info->m);
      // block += b1.transpose() * b1;
      MatrixTransposeMatrixMultiply
          <kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>(
          values + row.cells[i].position, row.block.size, block1_size,
          values + row.cells[i].position, row.block.size, block1_size,
          cell_info->values, r, c, row_stride, col_stride);
    }

    for (int j = i + 1; j < row.cells.size(); ++j) {
      const int block2 = row.cells[j].block_id - num_eliminate_blocks_;
      DCHECK_GE(block2, 0);
      DCHECK_LT(block1, block2);
      const int block2_size = bs->cols[row.cells[j].block_id].size;
      int r, c, row_stride, col_stride;
      CellInfo* cell_info = lhs->GetCell(block1, block2,
                                         &r, &c,
                                         &row_stride, &col_stride);
      if (cell_info != NULL) {
        // block += b1.transpose() * b2;
        CeresMutexLock l(&cell_info->m);
        MatrixTransposeMatrixMultiply
            <kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>(
                values + row.cells[i].position, row.block.size, block1_size,
                values + row.cells[j].position, row.block.size, block2_size,
                cell_info->values, r, c, row_stride, col_stride);
      }
    }
  }
}

}  // namespace internal
}  // namespace ceres

#endif  // CERES_INTERNAL_SCHUR_ELIMINATOR_IMPL_H_