Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #ifndef TENSORFLOW_KERNELS_SDCA_INTERNAL_H_
     17 #define TENSORFLOW_KERNELS_SDCA_INTERNAL_H_
     18 
     19 #define EIGEN_USE_THREADS
     20 
     21 #include <stddef.h>
     22 #include <algorithm>
     23 #include <cmath>
     24 #include <memory>
     25 #include <new>
     26 #include <unordered_map>
     27 #include <utility>
     28 #include <vector>
     29 
     30 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
     31 #include "tensorflow/core/framework/device_base.h"
     32 #include "tensorflow/core/framework/op_kernel.h"
     33 #include "tensorflow/core/framework/tensor.h"
     34 #include "tensorflow/core/framework/tensor_shape.h"
     35 #include "tensorflow/core/framework/tensor_types.h"
     36 #include "tensorflow/core/framework/types.h"
     37 #include "tensorflow/core/kernels/loss.h"
     38 #include "tensorflow/core/lib/core/coding.h"
     39 #include "tensorflow/core/lib/core/errors.h"
     40 #include "tensorflow/core/lib/core/status.h"
     41 #include "tensorflow/core/lib/core/stringpiece.h"
     42 #include "tensorflow/core/lib/gtl/inlined_vector.h"
     43 #include "tensorflow/core/lib/random/distribution_sampler.h"
     44 #include "tensorflow/core/lib/strings/stringprintf.h"
     45 #include "tensorflow/core/util/guarded_philox_random.h"
     46 #include "tensorflow/core/util/sparse/group_iterator.h"
     47 #include "tensorflow/core/util/sparse/sparse_tensor.h"
     48 #include "tensorflow/core/util/work_sharder.h"
     49 
     50 namespace tensorflow {
     51 
     52 namespace sdca {
     53 
     54 // Statistics computed with input (ModelWeights, Example).
     55 struct ExampleStatistics {
     56   // Logits for each class.
     57   // For binary case, this should be a vector of length 1; while for multiclass
     58   // case, this vector has the same length as the number of classes, where each
     59   // value corresponds to one class.
     60   // Use InlinedVector to avoid heap allocation for small number of classes.
     61   gtl::InlinedVector<double, 1> wx;
     62 
     63   // Logits for each class, using the previous weights.
     64   gtl::InlinedVector<double, 1> prev_wx;
     65 
     66   // Sum of squared feature values occurring in the example divided by
     67   // L2 * sum(example_weights).
     68   double normalized_squared_norm = 0;
     69 
     70   // Num_weight_vectors equals to the number of classification classes in the
     71   // multiclass case; while for binary case, it is 1.
     72   ExampleStatistics(const int num_weight_vectors)
     73       : wx(num_weight_vectors, 0.0), prev_wx(num_weight_vectors, 0.0) {}
     74 };
     75 
     76 class Regularizations {
     77  public:
     78   Regularizations(){};
     79 
     80   // Initialize() must be called immediately after construction.
     81   Status Initialize(OpKernelConstruction* const context) {
     82     TF_RETURN_IF_ERROR(context->GetAttr("l1", &symmetric_l1_));
     83     TF_RETURN_IF_ERROR(context->GetAttr("l2", &symmetric_l2_));
     84     shrinkage_ = symmetric_l1_ / symmetric_l2_;
     85     return Status::OK();
     86   }
     87 
     88   // Proximal SDCA shrinking for L1 regularization.
     89   double Shrink(const double weight) const {
     90     const double shrinked = std::max(std::abs(weight) - shrinkage_, 0.0);
     91     if (shrinked > 0.0) {
     92       return std::copysign(shrinked, weight);
     93     }
     94     return 0.0;
     95   }
     96 
     97   // Vectorized float variant of the above.
     98   Eigen::Tensor<float, 1, Eigen::RowMajor> EigenShrinkVector(
     99       const Eigen::Tensor<float, 1, Eigen::RowMajor> weights) const {
    100     // Proximal step on the weights which is sign(w)*|w - shrinkage|+.
    101     return weights.sign() * ((weights.abs() - weights.constant(shrinkage_))
    102                                  .cwiseMax(weights.constant(0.0)));
    103   }
    104 
    105   // Matrix float variant of the above.
    106   Eigen::Tensor<float, 2, Eigen::RowMajor> EigenShrinkMatrix(
    107       const Eigen::Tensor<float, 2, Eigen::RowMajor> weights) const {
    108     // Proximal step on the weights which is sign(w)*|w - shrinkage|+.
    109     return weights.sign() * ((weights.abs() - weights.constant(shrinkage_))
    110                                  .cwiseMax(weights.constant(0.0)));
    111   }
    112 
    113   float symmetric_l2() const { return symmetric_l2_; }
    114 
    115  private:
    116   float symmetric_l1_ = 0;
    117   float symmetric_l2_ = 0;
    118 
    119   // L1 divided by L2, pre-computed for use during weight shrinking.
    120   double shrinkage_ = 0;
    121 
    122   TF_DISALLOW_COPY_AND_ASSIGN(Regularizations);
    123 };
    124 
    125 class ModelWeights;
    126 
    127 // Struct describing a single example.
    128 class Example {
    129  public:
    130   // Compute matrix vector product between weights (a matrix) and features
    131   // (a vector). This method also computes the normalized example norm used
    132   // in SDCA update.
    133   // For multiclass case, num_weight_vectors equals to the number of classes;
    134   // while for binary case, it is 1.
    135   const ExampleStatistics ComputeWxAndWeightedExampleNorm(
    136       const int num_loss_partitions, const ModelWeights& model_weights,
    137       const Regularizations& regularization,
    138       const int num_weight_vectors) const;
    139 
    140   float example_label() const { return example_label_; }
    141 
    142   float example_weight() const { return example_weight_; }
    143 
    144   double squared_norm() const { return squared_norm_; }
    145 
    146   // Sparse features associated with the example.
    147   // Indices and Values are the associated feature index, and values. Values
    148   // can be optionally absent, in which we case we implicitly assume a value of
    149   // 1.0f.
    150   struct SparseFeatures {
    151     std::unique_ptr<TTypes<const int64>::UnalignedConstVec> indices;
    152     std::unique_ptr<TTypes<const float>::UnalignedConstVec>
    153         values;  // nullptr encodes optional.
    154   };
    155 
    156   // A dense vector which is a row-slice of the underlying matrix.
    157   struct DenseVector {
    158     // Returns a row slice from the matrix.
    159     Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>> Row()
    160         const {
    161       return Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>>(
    162           data_matrix.data() + row_index * data_matrix.dimension(1),
    163           data_matrix.dimension(1));
    164     }
    165 
    166     // Returns a row slice as a 1 * F matrix, where F is the number of features.
    167     Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor>>
    168     RowAsMatrix() const {
    169       return Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor>>(
    170           data_matrix.data() + row_index * data_matrix.dimension(1), 1,
    171           data_matrix.dimension(1));
    172     }
    173 
    174     const TTypes<float>::ConstMatrix data_matrix;
    175     const int64 row_index;
    176   };
    177 
    178  private:
    179   std::vector<SparseFeatures> sparse_features_;
    180   std::vector<std::unique_ptr<DenseVector>> dense_vectors_;
    181 
    182   float example_label_ = 0;
    183   float example_weight_ = 0;
    184   double squared_norm_ = 0;  // sum squared norm of the features.
    185 
    186   // Examples fills Example in a multi-threaded way.
    187   friend class Examples;
    188 
    189   // ModelWeights use each example for model update w += \alpha * x_{i};
    190   friend class ModelWeights;
    191 };
    192 
    193 // Weights related to features. For example, say you have two sets of sparse
    194 // features i.e. age bracket and country, then FeatureWeightsDenseStorage hold
    195 // the parameters for it. We keep track of the original weight passed in and the
    196 // delta weight which the optimizer learns in each call to the optimizer.
    197 class FeatureWeightsDenseStorage {
    198  public:
    199   FeatureWeightsDenseStorage(const TTypes<const float>::Matrix nominals,
    200                              TTypes<float>::Matrix deltas)
    201       : nominals_(nominals), deltas_(deltas) {
    202     CHECK(deltas.rank() > 1);
    203   }
    204 
    205   // Check if a feature index is with-in the bounds.
    206   bool IndexValid(const int64 index) const {
    207     return index >= 0 && index < deltas_.dimension(1);
    208   }
    209 
    210   // Nominals here are the original weight matrix.
    211   TTypes<const float>::Matrix nominals() const { return nominals_; }
    212 
    213   // Delta weights durining mini-batch updates.
    214   TTypes<float>::Matrix deltas() const { return deltas_; }
    215 
    216   // Updates delta weights based on active dense features in the example and
    217   // the corresponding dual residual.
    218   void UpdateDenseDeltaWeights(
    219       const Eigen::ThreadPoolDevice& device,
    220       const Example::DenseVector& dense_vector,
    221       const std::vector<double>& normalized_bounded_dual_delta);
    222 
    223  private:
    224   // The nominal value of the weight for a feature (indexed by its id).
    225   const TTypes<const float>::Matrix nominals_;
    226   // The accumulated delta weight for a feature (indexed by its id).
    227   TTypes<float>::Matrix deltas_;
    228 };
    229 
    230 // Similar to FeatureWeightsDenseStorage, but the underlying weights are stored
    231 // in an unordered map.
    232 class FeatureWeightsSparseStorage {
    233  public:
    234   FeatureWeightsSparseStorage(const TTypes<const int64>::Vec indices,
    235                               const TTypes<const float>::Matrix nominals,
    236                               TTypes<float>::Matrix deltas)
    237       : nominals_(nominals), deltas_(deltas) {
    238     // Create a map from sparse index to the dense index of the underlying
    239     // storage.
    240     for (int64 j = 0; j < indices.size(); ++j) {
    241       indices_to_id_[indices(j)] = j;
    242     }
    243   }
    244 
    245   // Check if a feature index exists.
    246   bool IndexValid(const int64 index) const {
    247     return indices_to_id_.find(index) != indices_to_id_.end();
    248   }
    249 
    250   // Nominal value at a particular feature index and class label.
    251   float nominals(const int class_id, const int64 index) const {
    252     auto it = indices_to_id_.find(index);
    253     return nominals_(class_id, it->second);
    254   }
    255 
    256   // Delta weights durining mini-batch updates.
    257   float deltas(const int class_id, const int64 index) const {
    258     auto it = indices_to_id_.find(index);
    259     return deltas_(class_id, it->second);
    260   }
    261 
    262   // Updates delta weights based on active sparse features in the example and
    263   // the corresponding dual residual.
    264   void UpdateSparseDeltaWeights(
    265       const Eigen::ThreadPoolDevice& device,
    266       const Example::SparseFeatures& sparse_features,
    267       const std::vector<double>& normalized_bounded_dual_delta);
    268 
    269  private:
    270   // The nominal value of the weight for a feature (indexed by its id).
    271   const TTypes<const float>::Matrix nominals_;
    272   // The accumulated delta weight for a feature (indexed by its id).
    273   TTypes<float>::Matrix deltas_;
    274   // Map from feature index to an index to the dense vector.
    275   std::unordered_map<int64, int64> indices_to_id_;
    276 };
    277 
    278 // Weights in the model, wraps both current weights, and the delta weights
    279 // for both sparse and dense features.
    280 class ModelWeights {
    281  public:
    282   ModelWeights() {}
    283 
    284   bool SparseIndexValid(const int col, const int64 index) const {
    285     return sparse_weights_[col].IndexValid(index);
    286   }
    287 
    288   bool DenseIndexValid(const int col, const int64 index) const {
    289     return dense_weights_[col].IndexValid(index);
    290   }
    291 
    292   // Go through all the features present in the example, and update the
    293   // weights based on the dual delta.
    294   void UpdateDeltaWeights(
    295       const Eigen::ThreadPoolDevice& device, const Example& example,
    296       const std::vector<double>& normalized_bounded_dual_delta);
    297 
    298   Status Initialize(OpKernelContext* const context);
    299 
    300   const std::vector<FeatureWeightsSparseStorage>& sparse_weights() const {
    301     return sparse_weights_;
    302   }
    303 
    304   const std::vector<FeatureWeightsDenseStorage>& dense_weights() const {
    305     return dense_weights_;
    306   }
    307 
    308  private:
    309   std::vector<FeatureWeightsSparseStorage> sparse_weights_;
    310   std::vector<FeatureWeightsDenseStorage> dense_weights_;
    311 
    312   TF_DISALLOW_COPY_AND_ASSIGN(ModelWeights);
    313 };
    314 
    315 // Examples contains all the training examples that SDCA uses for a mini-batch.
    316 class Examples {
    317  public:
    318   Examples() {}
    319 
    320   // Returns the Example at |example_index|.
    321   const Example& example(const int example_index) const {
    322     return examples_.at(example_index);
    323   }
    324 
    325   int sampled_index(const int id, const bool adaptative) const {
    326     if (adaptative) return sampled_index_[id];
    327     return id;
    328   }
    329 
    330   // Adaptive SDCA in the current implementation only works for
    331   // binary classification, where the input argument for num_weight_vectors
    332   // is 1.
    333   Status SampleAdaptativeProbabilities(
    334       const int num_loss_partitions, const Regularizations& regularization,
    335       const ModelWeights& model_weights,
    336       const TTypes<float>::Matrix example_state_data,
    337       const std::unique_ptr<DualLossUpdater>& loss_updater,
    338       const int num_weight_vectors);
    339 
    340   int num_examples() const { return examples_.size(); }
    341 
    342   int num_features() const { return num_features_; }
    343 
    344   // Initialize() must be called immediately after construction.
    345   Status Initialize(OpKernelContext* const context, const ModelWeights& weights,
    346                     int num_sparse_features,
    347                     int num_sparse_features_with_values,
    348                     int num_dense_features);
    349 
    350  private:
    351   // Reads the input tensors, and builds the internal representation for sparse
    352   // features per example. This function modifies the |examples| passed in
    353   // to build the sparse representations.
    354   static Status CreateSparseFeatureRepresentation(
    355       const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
    356       int num_sparse_features, const ModelWeights& weights,
    357       const OpInputList& sparse_example_indices_inputs,
    358       const OpInputList& sparse_feature_indices_inputs,
    359       const OpInputList& sparse_feature_values_inputs,
    360       std::vector<Example>* const examples);
    361 
    362   // Reads the input tensors, and builds the internal representation for dense
    363   // features per example. This function modifies the |examples| passed in
    364   // to build the sparse representations.
    365   static Status CreateDenseFeatureRepresentation(
    366       const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
    367       int num_dense_features, const ModelWeights& weights,
    368       const OpInputList& dense_features_inputs,
    369       std::vector<Example>* const examples);
    370 
    371   // Computes squared example norm per example i.e |x|^2. This function modifies
    372   // the |examples| passed in and adds the squared norm per example.
    373   static void ComputeSquaredNormPerExample(
    374       const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
    375       int num_sparse_features, int num_dense_features,
    376       std::vector<Example>* const examples);
    377 
    378   // All examples in the batch.
    379   std::vector<Example> examples_;
    380 
    381   // Adaptative sampling variables
    382   std::vector<float> probabilities_;
    383   std::vector<int> sampled_index_;
    384   std::vector<int> sampled_count_;
    385 
    386   int num_features_ = 0;
    387 
    388   TF_DISALLOW_COPY_AND_ASSIGN(Examples);
    389 };
    390 
    391 }  // namespace sdca
    392 }  // namespace tensorflow
    393 
    394 #endif  // TENSORFLOW_KERNELS_SDCA_INTERNAL_H_
    395