Home | History | Annotate | Download | only in util
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 #include "tensorflow/core/util/example_proto_helper.h"
     16 
     17 #include <vector>
     18 
     19 #include "tensorflow/core/example/example.pb.h"
     20 #include "tensorflow/core/example/feature.pb_text.h"
     21 #include "tensorflow/core/framework/numeric_op.h"
     22 #include "tensorflow/core/framework/register_types.h"
     23 #include "tensorflow/core/lib/core/errors.h"
     24 #include "tensorflow/core/platform/logging.h"
     25 #include "tensorflow/core/platform/protobuf.h"
     26 #include "tensorflow/core/util/sparse/sparse_tensor.h"
     27 
     28 namespace tensorflow {
     29 
     30 Status CheckValidType(const DataType& dtype) {
     31   switch (dtype) {
     32     case DT_INT64:
     33     case DT_FLOAT:
     34     case DT_STRING:
     35       return Status::OK();
     36     default:
     37       return errors::InvalidArgument("Received input dtype: ",
     38                                      DataTypeString(dtype));
     39   }
     40 }
     41 
     42 Status CheckTypesMatch(const Feature& feature, const DataType& dtype,
     43                        bool* match) {
     44   switch (dtype) {
     45     case DT_INT64:
     46       *match = (feature.kind_case() == Feature::kInt64List);
     47       break;
     48     case DT_FLOAT:
     49       *match = (feature.kind_case() == Feature::kFloatList);
     50       break;
     51     case DT_STRING:
     52       *match = (feature.kind_case() == Feature::kBytesList);
     53       break;
     54     default:
     55       return errors::InvalidArgument("Invalid input dtype: ",
     56                                      DataTypeString(dtype));
     57   }
     58   return Status::OK();
     59 }
     60 
     61 Status FeatureDenseCopy(const std::size_t out_index, const string& name,
     62                         const string& key, const DataType& dtype,
     63                         const TensorShape& shape, const Feature& feature,
     64                         Tensor* out) {
     65   const std::size_t num_elements = shape.num_elements();
     66   const std::size_t offset = out_index * num_elements;
     67 
     68   switch (dtype) {
     69     case DT_INT64: {
     70       const Int64List& values = feature.int64_list();
     71       if (static_cast<size_t>(values.value_size()) != num_elements) {
     72         return errors::InvalidArgument(
     73             "Name: ", name, ", Key: ", key, ", Index: ", out_index,
     74             ".  Number of int64 values != expected.  "
     75             "values size: ",
     76             values.value_size(), " but output shape: ", shape.DebugString());
     77       }
     78       auto out_p = out->flat<int64>().data() + offset;
     79       std::copy_n(values.value().data(), num_elements, out_p);
     80       return Status::OK();
     81     }
     82     case DT_FLOAT: {
     83       const FloatList& values = feature.float_list();
     84       if (static_cast<size_t>(values.value_size()) != num_elements) {
     85         return errors::InvalidArgument(
     86             "Name: ", name, ", Key: ", key, ", Index: ", out_index,
     87             ".  Number of float values != expected.  "
     88             "values size: ",
     89             values.value_size(), " but output shape: ", shape.DebugString());
     90       }
     91       auto out_p = out->flat<float>().data() + offset;
     92       std::copy_n(values.value().data(), num_elements, out_p);
     93       return Status::OK();
     94     }
     95     case DT_STRING: {
     96       const BytesList& values = feature.bytes_list();
     97       if (static_cast<size_t>(values.value_size()) != num_elements) {
     98         return errors::InvalidArgument(
     99             "Name: ", name, ", Key ", key, ", Index: ", out_index,
    100             ".  Number of bytes values != expected.  "
    101             "Values size: ",
    102             values.value_size(), " but output shape: ", shape.DebugString());
    103       }
    104       auto out_p = out->flat<string>().data() + offset;
    105       std::transform(values.value().data(),
    106                      values.value().data() + num_elements, out_p,
    107                      [](const string* s) { return *s; });
    108       return Status::OK();
    109     }
    110     default:
    111       return errors::InvalidArgument("Invalid input dtype: ",
    112                                      DataTypeString(dtype));
    113   }
    114 }
    115 
    116 Tensor FeatureSparseCopy(const std::size_t batch, const string& key,
    117                          const DataType& dtype, const Feature& feature) {
    118   switch (dtype) {
    119     case DT_INT64: {
    120       const Int64List& values = feature.int64_list();
    121       const int64 num_elements = values.value_size();
    122       Tensor out(dtype, TensorShape({num_elements}));
    123       auto out_p = out.flat<int64>().data();
    124       std::copy_n(values.value().data(), num_elements, out_p);
    125       return out;
    126     }
    127     case DT_FLOAT: {
    128       const FloatList& values = feature.float_list();
    129       const int64 num_elements = values.value_size();
    130       Tensor out(dtype, TensorShape({num_elements}));
    131       auto out_p = out.flat<float>().data();
    132       std::copy_n(values.value().data(), num_elements, out_p);
    133       return out;
    134     }
    135     case DT_STRING: {
    136       const BytesList& values = feature.bytes_list();
    137       const int64 num_elements = values.value_size();
    138       Tensor out(dtype, TensorShape({num_elements}));
    139       auto out_p = out.flat<string>().data();
    140       std::transform(values.value().data(),
    141                      values.value().data() + num_elements, out_p,
    142                      [](const string* s) { return *s; });
    143       return out;
    144     }
    145     default:
    146       LOG(FATAL) << "not supposed to be here.  dtype requested: " << dtype;
    147   }
    148 }
    149 
    150 int64 CopyIntoSparseTensor(const Tensor& in, const int batch,
    151                            const int64 offset, Tensor* indices,
    152                            Tensor* values) {
    153   const int64 num_elements = in.shape().num_elements();
    154   const DataType& dtype = in.dtype();
    155   CHECK_EQ(dtype, values->dtype());
    156 
    157   // Update indices.
    158   auto ix_t = indices->matrix<int64>();
    159   int64* ix_p = &ix_t(offset, 0);
    160   for (int64 i = 0; i < num_elements; ++i, ix_p += 2) {
    161     *ix_p = batch;    // Column 0 stores the batch entry
    162     *(ix_p + 1) = i;  // Column 1 stores the index in the batch
    163   }
    164 
    165   // Copy values over.
    166   switch (dtype) {
    167     case DT_INT64: {
    168       std::copy_n(in.flat<int64>().data(), num_elements,
    169                   values->flat<int64>().data() + offset);
    170       break;
    171     }
    172     case DT_FLOAT: {
    173       std::copy_n(in.flat<float>().data(), num_elements,
    174                   values->flat<float>().data() + offset);
    175       break;
    176     }
    177     case DT_STRING: {
    178       std::copy_n(in.flat<string>().data(), num_elements,
    179                   values->flat<string>().data() + offset);
    180       break;
    181     }
    182     default:
    183       LOG(FATAL) << "Not supposed to be here.  Saw dtype: " << dtype;
    184   }
    185 
    186   return num_elements;
    187 }
    188 
    189 void RowDenseCopy(const std::size_t& out_index, const DataType& dtype,
    190                   const Tensor& in, Tensor* out) {
    191   const std::size_t num_elements = in.shape().num_elements();
    192   const std::size_t offset = out_index * num_elements;
    193 
    194   switch (dtype) {
    195     case DT_INT64: {
    196       std::copy_n(in.flat<int64>().data(), num_elements,
    197                   out->flat<int64>().data() + offset);
    198       break;
    199     }
    200     case DT_FLOAT: {
    201       std::copy_n(in.flat<float>().data(), num_elements,
    202                   out->flat<float>().data() + offset);
    203       break;
    204     }
    205     case DT_STRING: {
    206       std::copy_n(in.flat<string>().data(), num_elements,
    207                   out->flat<string>().data() + offset);
    208       break;
    209     }
    210     default:
    211       LOG(FATAL) << "Not supposed to be here.  Saw dtype: " << dtype;
    212   }
    213 }
    214 
// Parses one Example proto into row `batch_index` of the batch outputs.
//
// Dense (FixedLenFeature) values are copied directly into
// (*output_dense_values_tensor)[d], which must already be allocated with a
// leading batch dimension; missing optional features fall back to the
// configured default value.  Sparse (VarLenFeature) values are staged as
// per-example tensors in (*output_sparse_values_tmp)[d][batch_index] for a
// later batch-wide consolidation pass.
//
// `example_name` is used only to build error messages.  Returns
// InvalidArgument when a required dense feature is absent or a present
// feature's proto kind does not match the configured dtype.
Status SingleExampleProtoToTensors(
    const Example& example, const string& example_name, const int batch_index,
    const std::vector<FixedLenFeature>& fixed_len_features,
    const std::vector<VarLenFeature>& var_len_features,
    std::vector<Tensor*>* output_dense_values_tensor,
    std::vector<std::vector<Tensor>>* output_sparse_values_tmp) {
  const Features& features = example.features();
  const auto& feature_dict = features.feature();

  // Handle dense features.
  for (size_t d = 0; d < fixed_len_features.size(); ++d) {
    const FixedLenFeature& feature_config = fixed_len_features[d];
    const string& key = feature_config.key;
    const DataType& dtype = feature_config.dtype;
    const TensorShape& shape = feature_config.shape;
    const Tensor& default_value = feature_config.default_value;
    // A dense feature is required exactly when no default value was supplied.
    bool required = (default_value.NumElements() == 0);
    const auto& feature_found = feature_dict.find(key);
    const bool feature_has_data =  // Found key & data type is set
        (feature_found != feature_dict.end() &&
         (feature_found->second.kind_case() != Feature::KIND_NOT_SET));

    const bool required_ok = feature_has_data || !required;
    if (!required_ok) {
      return errors::InvalidArgument("Name: ", example_name, ", Feature: ", key,
                                     " is required but could not be found.");
    }

    // Perform the FeatureDenseCopy into the output dense_values tensor (if
    // the value is present).
    if (feature_has_data) {
      const Feature& f = feature_found->second;
      bool types_match;
      TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match));
      if (!types_match) {
        return errors::InvalidArgument("Name: ", example_name,
                                       ", Feature: ", key,
                                       ".  Data types don't match. ",
                                       "Expected type: ", DataTypeString(dtype),
                                       "  Feature is: ", ProtoDebugString(f));
      }
      TF_RETURN_IF_ERROR(FeatureDenseCopy(batch_index, example_name, key, dtype,
                                          shape, f,
                                          (*output_dense_values_tensor)[d]));
    } else {
      // If the value is missing, RowDenseCopy the default value.
      RowDenseCopy(batch_index, dtype, default_value,
                   (*output_dense_values_tensor)[d]);
    }
  }

  // Handle sparse features.
  for (size_t d = 0; d < var_len_features.size(); ++d) {
    const VarLenFeature& feature_config = var_len_features[d];
    const string& key = feature_config.key;
    const DataType& dtype = feature_config.dtype;
    const auto& feature_found = feature_dict.find(key);

    const bool feature_has_data =  // Found key & data type is set
        (feature_found != feature_dict.end() &&
         (feature_found->second.kind_case() != Feature::KIND_NOT_SET));

    if (feature_has_data) {
      const Feature& f = feature_found->second;
      bool types_match;
      TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match));
      if (!types_match) {
        return errors::InvalidArgument("Name: ", example_name,
                                       ", Feature: ", key,
                                       ".  Data types don't match. ",
                                       "Expected type: ", DataTypeString(dtype),
                                       "  Feature is: ", ProtoDebugString(f));
      }
      (*output_sparse_values_tmp)[d][batch_index] =
          FeatureSparseCopy(batch_index, key, dtype, f);
    } else {
      // Missing sparse features are fine: stage an empty tensor so the
      // consolidation pass sees zero values for this example.
      (*output_sparse_values_tmp)[d][batch_index] =
          Tensor(dtype, TensorShape({0}));
    }
  }
  return Status::OK();
}
    297 
    298 Status GetSparseTensorShapes(const VarLenFeature& var_len_feature,
    299                              const std::vector<Tensor>& sparse_values_tmp,
    300                              const int batch_size,
    301                              VarLenFeatureBatchShapes* output_shapes) {
    302   int64 total_num_features = 0;
    303   int64 max_num_features = 0;
    304   for (int b = 0; b < batch_size; ++b) {
    305     const Tensor& t = sparse_values_tmp[b];
    306     const int64 num_elements = t.shape().num_elements();
    307     total_num_features += num_elements;
    308     max_num_features = std::max(max_num_features, num_elements);
    309   }
    310   output_shapes->indices_shape.AddDim(total_num_features);
    311   output_shapes->indices_shape.AddDim(2);
    312   output_shapes->values_shape.AddDim(total_num_features);
    313   output_shapes->max_num_features = max_num_features;
    314   return Status::OK();
    315 }
    316 
    317 Status BatchExampleProtoToTensors(
    318     const std::vector<const Example*>& examples,
    319     const std::vector<string>& names,
    320     const std::vector<FixedLenFeature>& fixed_len_features,
    321     const std::vector<VarLenFeature>& var_len_features, Allocator* allocator,
    322     std::vector<Tensor>* output_dense_values_tensor,
    323     std::vector<Tensor>* output_sparse_indices_tensor,
    324     std::vector<Tensor>* output_sparse_values_tensor,
    325     std::vector<Tensor>* output_sparse_shapes_tensor) {
    326   const int batch_size = examples.size();
    327 
    328   const bool has_names = (!names.empty());
    329   if (has_names) {
    330     if (names.size() != examples.size()) {
    331       return errors::InvalidArgument(
    332           "Expected len(names) == len(examples), but got: ", names.size(),
    333           " vs. ", examples.size());
    334     }
    335   }
    336 
    337   // We also need a map of Tensor pointers for the SingleExampleProtoToTensors
    338   // call. (Is there a better solution here?)
    339   std::vector<Tensor*> output_dense_values_tensor_ptrs(
    340       fixed_len_features.size());
    341 
    342   // Preallocate dense_values, since we know their sizes.
    343   for (size_t d = 0; d < fixed_len_features.size(); ++d) {
    344     const FixedLenFeature& config = fixed_len_features[d];
    345     TensorShape out_shape;
    346     out_shape.AddDim(batch_size);
    347     const TensorShape& shape = config.shape;
    348     const DataType& dtype = config.dtype;
    349     for (const int dim : shape.dim_sizes()) out_shape.AddDim(dim);
    350     (*output_dense_values_tensor)[d] = Tensor(allocator, dtype, out_shape);
    351     output_dense_values_tensor_ptrs[d] = &(*output_dense_values_tensor)[d];
    352   }
    353 
    354   // Temporary vector to hold sparse values.
    355   std::vector<std::vector<Tensor>> sparse_values_tmp(var_len_features.size());
    356 
    357   for (size_t d = 0; d < var_len_features.size(); ++d) {
    358     sparse_values_tmp[d] = std::vector<Tensor>(batch_size);
    359   }
    360 
    361   for (size_t b = 0; b < examples.size(); ++b) {
    362     const Example& ex = *(examples[b]);
    363     const string& example_name = (has_names) ? names[b] : "<unknown>";
    364     TF_RETURN_IF_ERROR(SingleExampleProtoToTensors(
    365         ex, example_name, b, fixed_len_features, var_len_features,
    366         &output_dense_values_tensor_ptrs, &sparse_values_tmp));
    367   }
    368 
    369   for (size_t d = 0; d < var_len_features.size(); ++d) {
    370     const VarLenFeature& feature_config = var_len_features[d];
    371     const DataType& dtype = feature_config.dtype;
    372     const std::vector<Tensor>& sparse_values_tensor = sparse_values_tmp[d];
    373 
    374     VarLenFeatureBatchShapes sparse_tensor_batch_shapes;
    375     TF_RETURN_IF_ERROR(GetSparseTensorShapes(feature_config,
    376                                              sparse_values_tensor, batch_size,
    377                                              &sparse_tensor_batch_shapes));
    378     const TensorShape& indices_shape = sparse_tensor_batch_shapes.indices_shape;
    379     const TensorShape& values_shape = sparse_tensor_batch_shapes.values_shape;
    380 
    381     // Allocate the sparse indices here.
    382     (*output_sparse_indices_tensor)[d] =
    383         Tensor(allocator, DT_INT64, indices_shape);
    384     (*output_sparse_values_tensor)[d] = Tensor(allocator, dtype, values_shape);
    385     (*output_sparse_shapes_tensor)[d] =
    386         Tensor(allocator, DT_INT64, TensorShape({2}));
    387 
    388     auto shape_t = (*output_sparse_shapes_tensor)[d].vec<int64>();
    389     shape_t(0) = batch_size;
    390     shape_t(1) = sparse_tensor_batch_shapes.max_num_features;
    391 
    392     Tensor* sp_indices_d = &(*output_sparse_indices_tensor)[d];
    393     Tensor* sp_values_d = &(*output_sparse_values_tensor)[d];
    394 
    395     int64 offset = 0;
    396     for (int b = 0; b < batch_size; ++b) {
    397       const int64 num_elements = CopyIntoSparseTensor(
    398           sparse_values_tensor[b], b, offset, sp_indices_d, sp_values_d);
    399       offset += num_elements;
    400     }
    401   }
    402   return Status::OK();
    403 }
    404 
    405 Status ParseExampleAttrs::FinishInit() {
    406   if (static_cast<size_t>(num_sparse) != sparse_types.size()) {
    407     return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)");
    408   }
    409   if (static_cast<size_t>(num_dense) != dense_types.size()) {
    410     return errors::InvalidArgument("len(dense_keys) != len(dense_types)");
    411   }
    412   if (static_cast<size_t>(num_dense) != dense_shapes.size()) {
    413     return errors::InvalidArgument("len(dense_keys) != len(dense_shapes)");
    414   }
    415   if (num_dense > std::numeric_limits<int32>::max()) {
    416     return errors::InvalidArgument("num_dense_ too large");
    417   }
    418   for (const DataType& type : dense_types) {
    419     TF_RETURN_IF_ERROR(CheckValidType(type));
    420   }
    421   for (const DataType& type : sparse_types) {
    422     TF_RETURN_IF_ERROR(CheckValidType(type));
    423   }
    424   return Status::OK();
    425 }
    426 
    427 Status ParseSingleExampleAttrs::FinishInit() {
    428   if (sparse_keys.size() != sparse_types.size()) {
    429     return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)");
    430   }
    431   if (dense_keys.size() != dense_types.size()) {
    432     return errors::InvalidArgument("len(dense_keys) != len(dense_types)");
    433   }
    434   if (dense_keys.size() != dense_shapes.size()) {
    435     return errors::InvalidArgument("len(dense_keys) != len(dense_shapes)");
    436   }
    437   for (const DataType& type : dense_types) {
    438     TF_RETURN_IF_ERROR(CheckValidType(type));
    439   }
    440   for (const DataType& type : sparse_types) {
    441     TF_RETURN_IF_ERROR(CheckValidType(type));
    442   }
    443   return Status::OK();
    444 }
    445 
    446 Status ParseSingleSequenceExampleAttrs::FinishInit() {
    447   if (static_cast<size_t>(num_context_sparse) != context_sparse_types.size()) {
    448     return errors::InvalidArgument(
    449         "len(context_sparse_keys) != len(context_sparse_types)");
    450   }
    451   if (static_cast<size_t>(num_context_dense) != context_dense_types.size()) {
    452     return errors::InvalidArgument(
    453         "len(context_dense_keys) != len(context_dense_types)");
    454   }
    455   if (static_cast<size_t>(num_context_dense) != context_dense_shapes.size()) {
    456     return errors::InvalidArgument(
    457         "len(context_dense_keys) != len(context_dense_shapes)");
    458   }
    459   if (static_cast<size_t>(num_feature_list_sparse) !=
    460       feature_list_sparse_types.size()) {
    461     return errors::InvalidArgument(
    462         "len(feature_list_sparse_keys) != len(feature_list_sparse_types)");
    463   }
    464   if (static_cast<size_t>(num_feature_list_dense) !=
    465       feature_list_dense_types.size()) {
    466     return errors::InvalidArgument(
    467         "len(feature_list_dense_keys) != "
    468         "len(feature_list_dense_types)");
    469   }
    470   for (const DataType& type : context_dense_types) {
    471     TF_RETURN_IF_ERROR(CheckValidType(type));
    472   }
    473   for (const DataType& type : context_sparse_types) {
    474     TF_RETURN_IF_ERROR(CheckValidType(type));
    475   }
    476   for (const DataType& type : feature_list_dense_types) {
    477     TF_RETURN_IF_ERROR(CheckValidType(type));
    478   }
    479   for (const DataType& type : feature_list_sparse_types) {
    480     TF_RETURN_IF_ERROR(CheckValidType(type));
    481   }
    482   return Status::OK();
    483 }
    484 
    485 }  // namespace tensorflow
    486