1 // Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // ============================================================================= 15 #ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_ 16 #define TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_ 17 18 #include <limits> 19 20 #include "tensorflow/contrib/tensor_forest/kernels/data_spec.h" 21 #include "tensorflow/core/framework/op_kernel.h" 22 #include "tensorflow/core/framework/tensor.h" 23 #include "tensorflow/core/framework/tensor_types.h" 24 #include "tensorflow/core/kernels/bounds_check.h" 25 #include "tensorflow/core/lib/random/distribution_sampler.h" 26 #include "tensorflow/core/lib/random/simple_philox.h" 27 #include "tensorflow/core/lib/strings/strcat.h" 28 #include "tensorflow/core/platform/macros.h" 29 #include "tensorflow/core/platform/types.h" 30 31 namespace tensorflow { 32 namespace tensorforest { 33 34 // We hide Eigen's hideous types behind a function that returns the (i, j)-th 35 // entry of a two dimensional tensor; this is that function's type. 36 using GetFeatureFnType = std::function<float(int32, int32)>; 37 38 // TODO(gilberth): Put these in protos so they can be shared by C++ and python. 39 // Indexes in the tree representation's 2nd dimension for children and features. 40 const int32 CHILDREN_INDEX = 0; 41 const int32 FEATURE_INDEX = 1; 42 43 // Used in the tree's children sub-tensor to indicate leaf and free nodes. 44 const int32 LEAF_NODE = -1; 45 const int32 FREE_NODE = -2; 46 47 // Used to indicate column types, e.g. categorical vs. float 48 enum DataColumnTypes { kDataFloat = 0, kDataCategorical = 1 }; 49 50 // Calculates the sum of a tensor. 51 template <typename T> 52 T Sum(Tensor counts) { 53 Eigen::Tensor<T, 0, Eigen::RowMajor> count_sum = 54 counts.unaligned_flat<T>().sum(); 55 return count_sum(0); 56 } 57 58 // Get the two best scores and their indices among max splits. 59 void GetTwoBest(int max, const std::function<float(int)>& score_fn, 60 float* best_score, int* best_index, float* second_best_score, 61 int* second_best_index); 62 63 // If the Gini Impurity is g, this returns n^2 (g - 1). This is an int 64 // rather than a float, and so can be more easily checked for ties. 65 int BootstrapGini(int n, int s, const random::DistributionSampler& ds, 66 random::SimplePhilox* rand); 67 68 // Get the DataColumnTypes number for the given feature. 69 DataColumnTypes FindDenseFeatureSpec( 70 int32 input_feature, const tensorforest::TensorForestDataSpec& spec); 71 DataColumnTypes FindSparseFeatureSpec( 72 int32 input_feature, const tensorforest::TensorForestDataSpec& spec); 73 74 // Given an Eigen::Tensor type, calculate the Gini impurity. 75 template <typename T> 76 float RawWeightedGiniImpurity(const T& counts) { 77 // Our split score is the Gini impurity times the number of examples 78 // seen by the leaf. If c(i) denotes the i-th class count and c = sum_i c(i) 79 // then 80 // score = c * (1 - sum_i ( c(i) / c )^2 ) 81 // = c - sum_i c(i)^2 / c 82 const auto sum = counts.sum(); 83 const auto sum2 = counts.square().sum(); 84 Eigen::Tensor<float, 0, Eigen::RowMajor> ret = sum - (sum2 / sum); 85 return ret(0); 86 } 87 88 // Given an Eigen::Tensor type, calculate the smoothed Gini impurity, which we 89 // use to determine the best split (lowest) and which nodes to allocate first 90 // (highest). 91 template <typename T> 92 float WeightedGiniImpurity(const T& counts) { 93 const auto smoothed = counts + counts.constant(1.0f); 94 return RawWeightedGiniImpurity(smoothed); 95 } 96 97 template <typename T1, typename T2> 98 float WeightedVariance(const T1& sums, const T2& squares, float count) { 99 const auto e_x = sums / count; 100 const auto e_x2 = squares / count; 101 Eigen::Tensor<float, 0, Eigen::RowMajor> ret = (e_x2 - e_x.square()).sum(); 102 return count * ret(0); 103 } 104 105 // Returns the best split to use based on the (lowest) Gini impurity. 106 // Takes in the whole total and per-split count tensors because using 107 // Tensor::Slice returns a tensor of the same dimensionality, which makes 108 // things a little awkward. 109 int32 BestFeatureClassification(const Tensor& total_counts, 110 const Tensor& split_counts, int32 accumulator); 111 112 // Returns the best split to use based on the (lowest) variance. 113 int32 BestFeatureRegression(const Tensor& total_sums, 114 const Tensor& total_squares, 115 const Tensor& split_sums, 116 const Tensor& split_squares, int32 accumulator); 117 118 // Returns true if the best split's variance is sufficiently smaller than 119 // that of the next best split. 120 bool BestSplitDominatesRegression(const Tensor& total_sums, 121 const Tensor& total_squares, 122 const Tensor& split_sums, 123 const Tensor& split_squares, 124 int32 accumulator); 125 126 // Performs booststrap_samples bootstrap samples of the best split's class 127 // counts and the second best splits's class counts, and returns true if at 128 // least dominate_fraction of the time, the former has a better (lower) 129 // Gini impurity. Does not take over ownership of *rand. 130 bool BestSplitDominatesClassificationBootstrap( 131 const Tensor& total_counts, const Tensor& split_counts, int32 accumulator, 132 float dominate_fraction, tensorflow::random::SimplePhilox* rand); 133 134 // Returns true if the best split's Gini impurity is sufficiently smaller than 135 // that of the next best split, as measured by the Hoeffding Tree bound. 136 bool BestSplitDominatesClassificationHoeffding(const Tensor& total_counts, 137 const Tensor& split_counts, 138 int32 accumulator, 139 float dominate_fraction); 140 141 // Returns true if the best split's Gini impurity is sufficiently smaller than 142 // that of the next best split, as measured by a Chebyshev bound. 143 bool BestSplitDominatesClassificationChebyshev(const Tensor& total_counts, 144 const Tensor& split_counts, 145 int32 accumulator, 146 float dominate_fraction); 147 148 // Initializes everything in the given tensor to the given value. 149 template <typename T> 150 void Initialize(Tensor counts, T val = 0) { 151 auto flat = counts.unaligned_flat<T>(); 152 std::fill(flat.data(), flat.data() + flat.size(), val); 153 } 154 155 // Returns a function that accesses the (i,j)-th element of the specified two 156 // dimensional tensor. 157 GetFeatureFnType GetDenseFunctor(const Tensor& dense); 158 159 // Returns a function that looks for the j-th feature of the i-th example 160 // in the sparse data, or the default value if not found. See FindSparseValue. 161 GetFeatureFnType GetSparseFunctor(const Tensor& sparse_indices, 162 const Tensor& sparse_values); 163 164 // Returns true if the point falls to the right (i.e., the selected feature 165 // of the input point is greater than the bias threshold), and false if it 166 // falls to the left. 167 // Even though our input data is forced into float Tensors, it could have 168 // originally been something else (e.g. categorical string data) which 169 // we treat differently. 170 bool DecideNode(const GetFeatureFnType& get_dense, 171 const GetFeatureFnType& get_sparse, int32 i, int32 feature, 172 float bias, const tensorforest::TensorForestDataSpec& spec); 173 174 // If T is a sparse float matrix represented by sparse_input_indices and 175 // sparse_input_values, FindSparseValue returns T(i,j), or 0.0 if (i,j) 176 // isn't present in sparse_input_indices. sparse_input_indices is assumed 177 // to be sorted. 178 template <typename T1, typename T2> 179 float FindSparseValue(const T1& sparse_input_indices, 180 const T2& sparse_input_values, int32 i, int32 j) { 181 int32 low = 0; 182 int32 high = sparse_input_values.dimension(0); 183 while (low < high) { 184 int32 mid = (low + high) / 2; 185 int64 midi = internal::SubtleMustCopy(sparse_input_indices(mid, 0)); 186 int64 midj = internal::SubtleMustCopy(sparse_input_indices(mid, 1)); 187 if (midi == i) { 188 if (midj == j) { 189 return sparse_input_values(mid); 190 } 191 if (midj < j) { 192 low = mid + 1; 193 } else { 194 high = mid; 195 } 196 continue; 197 } 198 if (midi < i) { 199 low = mid + 1; 200 } else { 201 high = mid; 202 } 203 } 204 return 0.0; 205 } 206 207 // Returns the number of sparse values for example input_index. 208 // Also returns the index where those features start in sparse_input_start 209 // if any were found. 210 // Assumes that the first column in indices is ordered. 211 template <typename T1> 212 int32 GetNumSparseFeatures(const T1& indices, int32 input_index, 213 int64* sparse_input_start) { 214 // Binary search for input_index. 215 // TODO(gilberth): Consider using std::lower_bound, std::upper_bound 216 // for a simpler but possibly slower solution, or searching for 217 // input_start and input_end simultaneously. 218 const int64 num_total = indices.dimension(0); 219 int64 index; 220 int64 low = 0; 221 int64 high = num_total; 222 *sparse_input_start = -1; // Easy error checking. 223 224 while (true) { 225 if (low == high) { 226 return 0; 227 } 228 index = low + (high - low) / 2; 229 const int64 feature_index = indices(index, 0); 230 if (feature_index == input_index) { 231 // found it. 232 break; 233 } else if (feature_index < input_index) { 234 // Correct for the implicit floor in the index assignment. 235 if (low == index) { 236 return 0; 237 } 238 low = index; 239 } else { 240 high = index; 241 } 242 } 243 244 // Scan for the start and end of the input_index range. 245 int64 input_start = index; 246 int64 val = indices(input_start, 0); 247 while (val == input_index) { 248 --input_start; 249 if (input_start < 0) { 250 break; 251 } 252 val = indices(input_start, 0); 253 } 254 *sparse_input_start = input_start + 1; 255 int32 input_end = index; 256 val = indices(input_end, 0); 257 while (val == input_index) { 258 ++input_end; 259 if (input_end >= num_total) { 260 break; 261 } 262 val = indices(input_end, 0); 263 } 264 return input_end - input_start - 1; 265 } 266 267 // Returns left/right decision between the input value and the threshold bias. 268 // For floating point types, the decision is value > bias, but for 269 // categorical data, it is value != bias. 270 bool Decide(float value, float bias, DataColumnTypes type = kDataFloat); 271 272 // Returns true if all the splits are initialized. Since they get initialized 273 // in order, we can simply infer this from the last split. 274 // This should only be called for a single allocator's candidate features 275 // (i.e. candidate_split_features.Slice(accumulator, accumulator + 1) ). 276 template <typename EigenType> 277 bool IsAllInitialized(const EigenType& features, int32 accumulator, 278 int32 num_splits) { 279 return features(accumulator, num_splits - 1) >= 0; 280 } 281 282 // Tensorforest currently only allows tensors up to 2^31 elements. Return false 283 // if any dimension is greater than that, true otherwise. 284 inline bool CheckTensorBounds(OpKernelContext* context, const Tensor& tensor) { 285 for (int i = 0; i < (tensor).dims(); ++i) { 286 if (!TF_PREDICT_TRUE(tensor.shape().dim_size(i) < 287 std::numeric_limits<int32>::max())) { 288 context->CtxFailure((errors::InvalidArgument( 289 strings::StrCat("Tensor has a dimension that is greater than 2^31: ", 290 tensor.DebugString())))); 291 return false; 292 } 293 } 294 return true; 295 } 296 297 void GetParentWeightedMean(float leaf_sum, const float* leaf_data, 298 float parent_sum, const float* parent_data, 299 float valid_leaf_threshold, int num_outputs, 300 std::vector<float>* mean); 301 302 } // namespace tensorforest 303 } // namespace tensorflow 304 305 #endif // TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_ 306