1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_KERNELS_SDCA_INTERNAL_H_ 17 #define TENSORFLOW_KERNELS_SDCA_INTERNAL_H_ 18 19 #define EIGEN_USE_THREADS 20 21 #include <stddef.h> 22 #include <algorithm> 23 #include <cmath> 24 #include <memory> 25 #include <new> 26 #include <unordered_map> 27 #include <utility> 28 #include <vector> 29 30 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 31 #include "tensorflow/core/framework/device_base.h" 32 #include "tensorflow/core/framework/op_kernel.h" 33 #include "tensorflow/core/framework/tensor.h" 34 #include "tensorflow/core/framework/tensor_shape.h" 35 #include "tensorflow/core/framework/tensor_types.h" 36 #include "tensorflow/core/framework/types.h" 37 #include "tensorflow/core/kernels/loss.h" 38 #include "tensorflow/core/lib/core/coding.h" 39 #include "tensorflow/core/lib/core/errors.h" 40 #include "tensorflow/core/lib/core/status.h" 41 #include "tensorflow/core/lib/core/stringpiece.h" 42 #include "tensorflow/core/lib/gtl/inlined_vector.h" 43 #include "tensorflow/core/lib/random/distribution_sampler.h" 44 #include "tensorflow/core/lib/strings/stringprintf.h" 45 #include "tensorflow/core/util/guarded_philox_random.h" 46 #include "tensorflow/core/util/sparse/group_iterator.h" 47 #include "tensorflow/core/util/sparse/sparse_tensor.h" 48 #include "tensorflow/core/util/work_sharder.h" 49 50 namespace tensorflow { 51 52 namespace sdca { 53 54 // Statistics computed with input (ModelWeights, Example). 55 struct ExampleStatistics { 56 // Logits for each class. 57 // For binary case, this should be a vector of length 1; while for multiclass 58 // case, this vector has the same length as the number of classes, where each 59 // value corresponds to one class. 60 // Use InlinedVector to avoid heap allocation for small number of classes. 61 gtl::InlinedVector<double, 1> wx; 62 63 // Logits for each class, using the previous weights. 64 gtl::InlinedVector<double, 1> prev_wx; 65 66 // Sum of squared feature values occurring in the example divided by 67 // L2 * sum(example_weights). 68 double normalized_squared_norm = 0; 69 70 // Num_weight_vectors equals to the number of classification classes in the 71 // multiclass case; while for binary case, it is 1. 72 ExampleStatistics(const int num_weight_vectors) 73 : wx(num_weight_vectors, 0.0), prev_wx(num_weight_vectors, 0.0) {} 74 }; 75 76 class Regularizations { 77 public: 78 Regularizations(){}; 79 80 // Initialize() must be called immediately after construction. 81 Status Initialize(OpKernelConstruction* const context) { 82 TF_RETURN_IF_ERROR(context->GetAttr("l1", &symmetric_l1_)); 83 TF_RETURN_IF_ERROR(context->GetAttr("l2", &symmetric_l2_)); 84 shrinkage_ = symmetric_l1_ / symmetric_l2_; 85 return Status::OK(); 86 } 87 88 // Proximal SDCA shrinking for L1 regularization. 89 double Shrink(const double weight) const { 90 const double shrinked = std::max(std::abs(weight) - shrinkage_, 0.0); 91 if (shrinked > 0.0) { 92 return std::copysign(shrinked, weight); 93 } 94 return 0.0; 95 } 96 97 // Vectorized float variant of the above. 98 Eigen::Tensor<float, 1, Eigen::RowMajor> EigenShrinkVector( 99 const Eigen::Tensor<float, 1, Eigen::RowMajor> weights) const { 100 // Proximal step on the weights which is sign(w)*|w - shrinkage|+. 101 return weights.sign() * ((weights.abs() - weights.constant(shrinkage_)) 102 .cwiseMax(weights.constant(0.0))); 103 } 104 105 // Matrix float variant of the above. 106 Eigen::Tensor<float, 2, Eigen::RowMajor> EigenShrinkMatrix( 107 const Eigen::Tensor<float, 2, Eigen::RowMajor> weights) const { 108 // Proximal step on the weights which is sign(w)*|w - shrinkage|+. 109 return weights.sign() * ((weights.abs() - weights.constant(shrinkage_)) 110 .cwiseMax(weights.constant(0.0))); 111 } 112 113 float symmetric_l2() const { return symmetric_l2_; } 114 115 private: 116 float symmetric_l1_ = 0; 117 float symmetric_l2_ = 0; 118 119 // L1 divided by L2, pre-computed for use during weight shrinking. 120 double shrinkage_ = 0; 121 122 TF_DISALLOW_COPY_AND_ASSIGN(Regularizations); 123 }; 124 125 class ModelWeights; 126 127 // Struct describing a single example. 128 class Example { 129 public: 130 // Compute matrix vector product between weights (a matrix) and features 131 // (a vector). This method also computes the normalized example norm used 132 // in SDCA update. 133 // For multiclass case, num_weight_vectors equals to the number of classes; 134 // while for binary case, it is 1. 135 const ExampleStatistics ComputeWxAndWeightedExampleNorm( 136 const int num_loss_partitions, const ModelWeights& model_weights, 137 const Regularizations& regularization, 138 const int num_weight_vectors) const; 139 140 float example_label() const { return example_label_; } 141 142 float example_weight() const { return example_weight_; } 143 144 double squared_norm() const { return squared_norm_; } 145 146 // Sparse features associated with the example. 147 // Indices and Values are the associated feature index, and values. Values 148 // can be optionally absent, in which we case we implicitly assume a value of 149 // 1.0f. 150 struct SparseFeatures { 151 std::unique_ptr<TTypes<const int64>::UnalignedConstVec> indices; 152 std::unique_ptr<TTypes<const float>::UnalignedConstVec> 153 values; // nullptr encodes optional. 154 }; 155 156 // A dense vector which is a row-slice of the underlying matrix. 157 struct DenseVector { 158 // Returns a row slice from the matrix. 159 Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>> Row() 160 const { 161 return Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>>( 162 data_matrix.data() + row_index * data_matrix.dimension(1), 163 data_matrix.dimension(1)); 164 } 165 166 // Returns a row slice as a 1 * F matrix, where F is the number of features. 167 Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor>> 168 RowAsMatrix() const { 169 return Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor>>( 170 data_matrix.data() + row_index * data_matrix.dimension(1), 1, 171 data_matrix.dimension(1)); 172 } 173 174 const TTypes<float>::ConstMatrix data_matrix; 175 const int64 row_index; 176 }; 177 178 private: 179 std::vector<SparseFeatures> sparse_features_; 180 std::vector<std::unique_ptr<DenseVector>> dense_vectors_; 181 182 float example_label_ = 0; 183 float example_weight_ = 0; 184 double squared_norm_ = 0; // sum squared norm of the features. 185 186 // Examples fills Example in a multi-threaded way. 187 friend class Examples; 188 189 // ModelWeights use each example for model update w += \alpha * x_{i}; 190 friend class ModelWeights; 191 }; 192 193 // Weights related to features. For example, say you have two sets of sparse 194 // features i.e. age bracket and country, then FeatureWeightsDenseStorage hold 195 // the parameters for it. We keep track of the original weight passed in and the 196 // delta weight which the optimizer learns in each call to the optimizer. 197 class FeatureWeightsDenseStorage { 198 public: 199 FeatureWeightsDenseStorage(const TTypes<const float>::Matrix nominals, 200 TTypes<float>::Matrix deltas) 201 : nominals_(nominals), deltas_(deltas) { 202 CHECK(deltas.rank() > 1); 203 } 204 205 // Check if a feature index is with-in the bounds. 206 bool IndexValid(const int64 index) const { 207 return index >= 0 && index < deltas_.dimension(1); 208 } 209 210 // Nominals here are the original weight matrix. 211 TTypes<const float>::Matrix nominals() const { return nominals_; } 212 213 // Delta weights durining mini-batch updates. 214 TTypes<float>::Matrix deltas() const { return deltas_; } 215 216 // Updates delta weights based on active dense features in the example and 217 // the corresponding dual residual. 218 void UpdateDenseDeltaWeights( 219 const Eigen::ThreadPoolDevice& device, 220 const Example::DenseVector& dense_vector, 221 const std::vector<double>& normalized_bounded_dual_delta); 222 223 private: 224 // The nominal value of the weight for a feature (indexed by its id). 225 const TTypes<const float>::Matrix nominals_; 226 // The accumulated delta weight for a feature (indexed by its id). 227 TTypes<float>::Matrix deltas_; 228 }; 229 230 // Similar to FeatureWeightsDenseStorage, but the underlying weights are stored 231 // in an unordered map. 232 class FeatureWeightsSparseStorage { 233 public: 234 FeatureWeightsSparseStorage(const TTypes<const int64>::Vec indices, 235 const TTypes<const float>::Matrix nominals, 236 TTypes<float>::Matrix deltas) 237 : nominals_(nominals), deltas_(deltas) { 238 // Create a map from sparse index to the dense index of the underlying 239 // storage. 240 for (int64 j = 0; j < indices.size(); ++j) { 241 indices_to_id_[indices(j)] = j; 242 } 243 } 244 245 // Check if a feature index exists. 246 bool IndexValid(const int64 index) const { 247 return indices_to_id_.find(index) != indices_to_id_.end(); 248 } 249 250 // Nominal value at a particular feature index and class label. 251 float nominals(const int class_id, const int64 index) const { 252 auto it = indices_to_id_.find(index); 253 return nominals_(class_id, it->second); 254 } 255 256 // Delta weights durining mini-batch updates. 257 float deltas(const int class_id, const int64 index) const { 258 auto it = indices_to_id_.find(index); 259 return deltas_(class_id, it->second); 260 } 261 262 // Updates delta weights based on active sparse features in the example and 263 // the corresponding dual residual. 264 void UpdateSparseDeltaWeights( 265 const Eigen::ThreadPoolDevice& device, 266 const Example::SparseFeatures& sparse_features, 267 const std::vector<double>& normalized_bounded_dual_delta); 268 269 private: 270 // The nominal value of the weight for a feature (indexed by its id). 271 const TTypes<const float>::Matrix nominals_; 272 // The accumulated delta weight for a feature (indexed by its id). 273 TTypes<float>::Matrix deltas_; 274 // Map from feature index to an index to the dense vector. 275 std::unordered_map<int64, int64> indices_to_id_; 276 }; 277 278 // Weights in the model, wraps both current weights, and the delta weights 279 // for both sparse and dense features. 280 class ModelWeights { 281 public: 282 ModelWeights() {} 283 284 bool SparseIndexValid(const int col, const int64 index) const { 285 return sparse_weights_[col].IndexValid(index); 286 } 287 288 bool DenseIndexValid(const int col, const int64 index) const { 289 return dense_weights_[col].IndexValid(index); 290 } 291 292 // Go through all the features present in the example, and update the 293 // weights based on the dual delta. 294 void UpdateDeltaWeights( 295 const Eigen::ThreadPoolDevice& device, const Example& example, 296 const std::vector<double>& normalized_bounded_dual_delta); 297 298 Status Initialize(OpKernelContext* const context); 299 300 const std::vector<FeatureWeightsSparseStorage>& sparse_weights() const { 301 return sparse_weights_; 302 } 303 304 const std::vector<FeatureWeightsDenseStorage>& dense_weights() const { 305 return dense_weights_; 306 } 307 308 private: 309 std::vector<FeatureWeightsSparseStorage> sparse_weights_; 310 std::vector<FeatureWeightsDenseStorage> dense_weights_; 311 312 TF_DISALLOW_COPY_AND_ASSIGN(ModelWeights); 313 }; 314 315 // Examples contains all the training examples that SDCA uses for a mini-batch. 316 class Examples { 317 public: 318 Examples() {} 319 320 // Returns the Example at |example_index|. 321 const Example& example(const int example_index) const { 322 return examples_.at(example_index); 323 } 324 325 int sampled_index(const int id, const bool adaptative) const { 326 if (adaptative) return sampled_index_[id]; 327 return id; 328 } 329 330 // Adaptive SDCA in the current implementation only works for 331 // binary classification, where the input argument for num_weight_vectors 332 // is 1. 333 Status SampleAdaptativeProbabilities( 334 const int num_loss_partitions, const Regularizations& regularization, 335 const ModelWeights& model_weights, 336 const TTypes<float>::Matrix example_state_data, 337 const std::unique_ptr<DualLossUpdater>& loss_updater, 338 const int num_weight_vectors); 339 340 int num_examples() const { return examples_.size(); } 341 342 int num_features() const { return num_features_; } 343 344 // Initialize() must be called immediately after construction. 345 Status Initialize(OpKernelContext* const context, const ModelWeights& weights, 346 int num_sparse_features, 347 int num_sparse_features_with_values, 348 int num_dense_features); 349 350 private: 351 // Reads the input tensors, and builds the internal representation for sparse 352 // features per example. This function modifies the |examples| passed in 353 // to build the sparse representations. 354 static Status CreateSparseFeatureRepresentation( 355 const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples, 356 int num_sparse_features, const ModelWeights& weights, 357 const OpInputList& sparse_example_indices_inputs, 358 const OpInputList& sparse_feature_indices_inputs, 359 const OpInputList& sparse_feature_values_inputs, 360 std::vector<Example>* const examples); 361 362 // Reads the input tensors, and builds the internal representation for dense 363 // features per example. This function modifies the |examples| passed in 364 // to build the sparse representations. 365 static Status CreateDenseFeatureRepresentation( 366 const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples, 367 int num_dense_features, const ModelWeights& weights, 368 const OpInputList& dense_features_inputs, 369 std::vector<Example>* const examples); 370 371 // Computes squared example norm per example i.e |x|^2. This function modifies 372 // the |examples| passed in and adds the squared norm per example. 373 static void ComputeSquaredNormPerExample( 374 const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples, 375 int num_sparse_features, int num_dense_features, 376 std::vector<Example>* const examples); 377 378 // All examples in the batch. 379 std::vector<Example> examples_; 380 381 // Adaptative sampling variables 382 std::vector<float> probabilities_; 383 std::vector<int> sampled_index_; 384 std::vector<int> sampled_count_; 385 386 int num_features_ = 0; 387 388 TF_DISALLOW_COPY_AND_ASSIGN(Examples); 389 }; 390 391 } // namespace sdca 392 } // namespace tensorflow 393 394 #endif // TENSORFLOW_KERNELS_SDCA_INTERNAL_H_ 395