// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
#define TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_

#include <limits>

#include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/lib/random/distribution_sampler.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {
namespace tensorforest {

// We hide Eigen's hideous types behind a function that returns the (i, j)-th
// entry of a two-dimensional tensor; this is that function's type.
using GetFeatureFnType = std::function<float(int32, int32)>;
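//
// For example, any callable with this shape converts to GetFeatureFnType
// (a minimal sketch; the lambda below is illustrative, not part of the API):
//
//   GetFeatureFnType fn = [](int32 i, int32 j) -> float {
//     return static_cast<float>(10 * i + j);  // stand-in for tensor access
//   };
//   const float v = fn(2, 3);  // v == 23.0f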

// TODO(gilberth): Put these in protos so they can be shared by C++ and python.
// Indexes in the tree representation's 2nd dimension for children and features.
const int32 CHILDREN_INDEX = 0;
const int32 FEATURE_INDEX = 1;

// Used in the tree's children sub-tensor to indicate leaf and free nodes.
const int32 LEAF_NODE = -1;
const int32 FREE_NODE = -2;
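//
// A sketch of how these constants are read (illustrative only; assumes a
// tree tensor of shape [num_nodes, 2] accessed through an Eigen matrix
// `tree`):
//
//   const int32 child = tree(node, CHILDREN_INDEX);
//   if (child == LEAF_NODE) {
//     // `node` is a leaf; its feature entry is unused.
//   } else if (child == FREE_NODE) {
//     // `node` is unallocated.
//   } else {
//     // Internal node: test tree(node, FEATURE_INDEX) to route the example.
//   }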

// Used to indicate column types, e.g. categorical vs. float
enum DataColumnTypes { kDataFloat = 0, kDataCategorical = 1 };

// Calculates the sum of a tensor.
template <typename T>
T Sum(Tensor counts) {
  Eigen::Tensor<T, 0, Eigen::RowMajor> count_sum =
      counts.unaligned_flat<T>().sum();
  return count_sum(0);
}
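//
// Example usage (a minimal sketch; assumes a float Tensor):
//
//   Tensor counts(DT_FLOAT, TensorShape({3}));
//   Initialize<float>(counts, 2.0f);          // defined later in this file
//   const float total = Sum<float>(counts);   // total == 6.0f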

// Gets the two best (lowest) scores and their indices among the max
// candidate splits scored by score_fn(0) through score_fn(max - 1).
void GetTwoBest(int max, const std::function<float(int)>& score_fn,
                float* best_score, int* best_index, float* second_best_score,
                int* second_best_index);
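//
// Example usage (an illustrative sketch; "best" means lowest, consistent
// with the lowest-Gini / lowest-variance convention used in this file):
//
//   const std::vector<float> scores = {3.0f, 1.0f, 2.0f};
//   float best_score, second_best_score;
//   int best_index, second_best_index;
//   GetTwoBest(3, [&](int i) { return scores[i]; },
//              &best_score, &best_index,
//              &second_best_score, &second_best_index);
//   // best_index == 1 (score 1.0), second_best_index == 2 (score 2.0).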

// If the Gini Impurity is g, this returns n^2 (g - 1).  This is an int
// rather than a float, and so can be more easily checked for ties.
int BootstrapGini(int n, int s, const random::DistributionSampler& ds,
                  random::SimplePhilox* rand);

// Get the DataColumnTypes value for the given feature.
DataColumnTypes FindDenseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec);
DataColumnTypes FindSparseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec);

// Given an Eigen::Tensor type, calculate the weighted Gini impurity, i.e. the
// Gini impurity times the number of examples.
template <typename T>
float RawWeightedGiniImpurity(const T& counts) {
  // Our split score is the Gini impurity times the number of examples
  // seen by the leaf.  If c(i) denotes the i-th class count and c = sum_i c(i)
  // then
  //   score = c * (1 - sum_i ( c(i) / c )^2 )
  //         = c - sum_i c(i)^2 / c
  const auto sum = counts.sum();
  const auto sum2 = counts.square().sum();
  Eigen::Tensor<float, 0, Eigen::RowMajor> ret = sum - (sum2 / sum);
  return ret(0);
}
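//
// Worked example: for counts (2, 2), c = 4 and sum_i c(i)^2 = 8, so the
// score is 4 - 8/4 = 2.  For the purer counts (4, 0), the score is
// 4 - 16/4 = 0; lower scores mean purer splits.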

// Given an Eigen::Tensor type, calculate the weighted Gini impurity after
// adding one to each class count (Laplace smoothing).  We use it to determine
// the best split (lowest) and which nodes to allocate first (highest).
template <typename T>
float WeightedGiniImpurity(const T& counts) {
  const auto smoothed = counts + counts.constant(1.0f);
  return RawWeightedGiniImpurity(smoothed);
}

// Returns count times the sum, over output dimensions, of the variance
// Var(x) = E[x^2] - E[x]^2, where the expectations are estimated from the
// given per-dimension sums and sums of squares over count examples.
template <typename T1, typename T2>
float WeightedVariance(const T1& sums, const T2& squares, float count) {
  const auto e_x = sums / count;
  const auto e_x2 = squares / count;
  Eigen::Tensor<float, 0, Eigen::RowMajor> ret = (e_x2 - e_x.square()).sum();
  return count * ret(0);
}
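//
// Worked example (single output): for the values {1, 2, 3}, sums = 6,
// squares = 14 and count = 3, so E[x] = 2, E[x^2] = 14/3, and the result is
// 3 * (14/3 - 2^2) = 2.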

// Returns the best split to use based on the (lowest) Gini impurity.
// Takes in the whole total and per-split count tensors because using
// Tensor::Slice returns a tensor of the same dimensionality, which makes
// things a little awkward.
int32 BestFeatureClassification(const Tensor& total_counts,
                                const Tensor& split_counts, int32 accumulator);

// Returns the best split to use based on the (lowest) variance.
int32 BestFeatureRegression(const Tensor& total_sums,
                            const Tensor& total_squares,
                            const Tensor& split_sums,
                            const Tensor& split_squares, int32 accumulator);

// Returns true if the best split's variance is sufficiently smaller than
// that of the next best split.
bool BestSplitDominatesRegression(const Tensor& total_sums,
                                  const Tensor& total_squares,
                                  const Tensor& split_sums,
                                  const Tensor& split_squares,
                                  int32 accumulator);

// Performs bootstrap_samples bootstrap samples of the best split's class
// counts and the second best split's class counts, and returns true if at
// least dominate_fraction of the time, the former has a better (lower)
// Gini impurity.  Does not take ownership of *rand.
bool BestSplitDominatesClassificationBootstrap(
    const Tensor& total_counts, const Tensor& split_counts, int32 accumulator,
    float dominate_fraction, tensorflow::random::SimplePhilox* rand);

// Returns true if the best split's Gini impurity is sufficiently smaller than
// that of the next best split, as measured by the Hoeffding Tree bound.
bool BestSplitDominatesClassificationHoeffding(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction);

// Returns true if the best split's Gini impurity is sufficiently smaller than
// that of the next best split, as measured by a Chebyshev bound.
bool BestSplitDominatesClassificationChebyshev(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction);

// Initializes everything in the given tensor to the given value.
template <typename T>
void Initialize(Tensor counts, T val = 0) {
  auto flat = counts.unaligned_flat<T>();
  std::fill(flat.data(), flat.data() + flat.size(), val);
}

// Returns a function that accesses the (i, j)-th element of the specified
// two-dimensional tensor.
GetFeatureFnType GetDenseFunctor(const Tensor& dense);

// Returns a function that looks up the j-th feature of the i-th example in
// the sparse data, returning 0.0 if it is not found.  See FindSparseValue.
GetFeatureFnType GetSparseFunctor(const Tensor& sparse_indices,
                                  const Tensor& sparse_values);

// Returns true if the point falls to the right (i.e., the selected feature
// of the input point is greater than the bias threshold), and false if it
// falls to the left.
// Even though our input data is forced into float Tensors, it could have
// originally been something else (e.g., categorical string data), which we
// treat differently.
bool DecideNode(const GetFeatureFnType& get_dense,
                const GetFeatureFnType& get_sparse, int32 i, int32 feature,
                float bias, const tensorforest::TensorForestDataSpec& spec);
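//
// Example usage (a minimal sketch; `dense`, `sparse_indices`,
// `sparse_values` and `spec` are assumed to come from the op's inputs):
//
//   const auto get_dense = GetDenseFunctor(dense);
//   const auto get_sparse = GetSparseFunctor(sparse_indices, sparse_values);
//   const bool go_right =
//       DecideNode(get_dense, get_sparse, example, feature, bias, spec);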

// If T is a sparse float matrix represented by sparse_input_indices and
// sparse_input_values, FindSparseValue returns T(i, j), or 0.0 if (i, j)
// isn't present in sparse_input_indices.  sparse_input_indices is assumed
// to be sorted lexicographically by (row, column).
template <typename T1, typename T2>
float FindSparseValue(const T1& sparse_input_indices,
                      const T2& sparse_input_values, int32 i, int32 j) {
  // Binary search over the rows of sparse_input_indices.
  int32 low = 0;
  int32 high = sparse_input_values.dimension(0);
  while (low < high) {
    int32 mid = (low + high) / 2;
    int64 midi = internal::SubtleMustCopy(sparse_input_indices(mid, 0));
    int64 midj = internal::SubtleMustCopy(sparse_input_indices(mid, 1));
    if (midi == i) {
      if (midj == j) {
        return sparse_input_values(mid);
      }
      if (midj < j) {
        low = mid + 1;
      } else {
        high = mid;
      }
      continue;
    }
    if (midi < i) {
      low = mid + 1;
    } else {
      high = mid;
    }
  }
  return 0.0;
}
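//
// Example usage (an illustrative sketch; assumes the usual SparseTensor
// layout of an [nnz, 2] int64 indices tensor and an [nnz] float values
// tensor):
//
//   const auto indices = sparse_input_indices.matrix<int64>();
//   const auto values = sparse_input_values.vec<float>();
//   const float v = FindSparseValue(indices, values, example, feature);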

// Returns the number of sparse values for example input_index.
// If any are found, also returns in *sparse_input_start the index at which
// those features start; otherwise *sparse_input_start is left as -1.
// Assumes that the first column of indices is sorted.
template <typename T1>
int32 GetNumSparseFeatures(const T1& indices, int32 input_index,
                           int64* sparse_input_start) {
  // Binary search for input_index.
  // TODO(gilberth): Consider using std::lower_bound, std::upper_bound
  // for a simpler but possibly slower solution, or searching for
  // input_start and input_end simultaneously.
  const int64 num_total = indices.dimension(0);
  int64 index;
  int64 low = 0;
  int64 high = num_total;
  *sparse_input_start = -1;  // Easy error checking.

  while (true) {
    if (low == high) {
      return 0;
    }
    index = low + (high - low) / 2;
    const int64 feature_index = indices(index, 0);
    if (feature_index == input_index) {
      // Found it.
      break;
    } else if (feature_index < input_index) {
      // Correct for the implicit floor in the index assignment.
      if (low == index) {
        return 0;
      }
      low = index;
    } else {
      high = index;
    }
  }

  // Scan for the start and end of the input_index range.
  int64 input_start = index;
  int64 val = indices(input_start, 0);
  while (val == input_index) {
    --input_start;
    if (input_start < 0) {
      break;
    }
    val = indices(input_start, 0);
  }
  *sparse_input_start = input_start + 1;
  int32 input_end = index;
  val = indices(input_end, 0);
  while (val == input_index) {
    ++input_end;
    if (input_end >= num_total) {
      break;
    }
    val = indices(input_end, 0);
  }
  return input_end - input_start - 1;
}
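//
// Example usage (an illustrative sketch, same sparse layout as above):
//
//   int64 start = -1;
//   const int32 num = GetNumSparseFeatures(indices, example, &start);
//   // If num > 0, rows [start, start + num) of `indices` all belong to
//   // `example`.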

// Returns left/right decision between the input value and the threshold bias.
// For floating point types, the decision is value > bias, but for
// categorical data, it is value != bias.
bool Decide(float value, float bias, DataColumnTypes type = kDataFloat);
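//
// For example, Decide(3.0f, 2.5f) is true (3.0 > 2.5), while
// Decide(2.0f, 2.0f, kDataCategorical) is false (the category matches the
// bias).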

// Returns true if all the splits are initialized.  Since they get initialized
// in order, we can simply infer this from the last split.
// This should only be called for a single allocator's candidate features
// (i.e. candidate_split_features.Slice(accumulator, accumulator + 1)).
template <typename EigenType>
bool IsAllInitialized(const EigenType& features, int32 accumulator,
                      int32 num_splits) {
  return features(accumulator, num_splits - 1) >= 0;
}

// TensorForest currently only allows tensor dimensions of up to 2^31 - 1
// elements.  Returns false, after setting an InvalidArgument error on the
// context, if any dimension of the given tensor is at least that large;
// returns true otherwise.
inline bool CheckTensorBounds(OpKernelContext* context, const Tensor& tensor) {
  for (int i = 0; i < tensor.dims(); ++i) {
    if (!TF_PREDICT_TRUE(tensor.shape().dim_size(i) <
                         std::numeric_limits<int32>::max())) {
      context->CtxFailure(errors::InvalidArgument(
          strings::StrCat("Tensor has a dimension that is >= 2^31 - 1: ",
                          tensor.DebugString())));
      return false;
    }
  }
  return true;
}
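//
// Typical usage inside an op kernel's Compute (a minimal sketch):
//
//   const Tensor& input_data = context->input(0);
//   if (!CheckTensorBounds(context, input_data)) return;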

// Computes the per-output mean of a leaf's accumulated values.  When the leaf
// has seen fewer than valid_leaf_threshold examples, the parent's data is
// blended in to smooth the estimate.  Writes num_outputs entries to *mean.
void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
                           float parent_sum, const float* parent_data,
                           float valid_leaf_threshold, int num_outputs,
                           std::vector<float>* mean);

}  // namespace tensorforest
}  // namespace tensorflow

#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_