1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 #include "tensorflow/core/util/example_proto_helper.h" 16 17 #include <vector> 18 19 #include "tensorflow/core/example/example.pb.h" 20 #include "tensorflow/core/example/feature.pb_text.h" 21 #include "tensorflow/core/framework/numeric_op.h" 22 #include "tensorflow/core/framework/register_types.h" 23 #include "tensorflow/core/lib/core/errors.h" 24 #include "tensorflow/core/platform/logging.h" 25 #include "tensorflow/core/platform/protobuf.h" 26 #include "tensorflow/core/util/sparse/sparse_tensor.h" 27 28 namespace tensorflow { 29 30 Status CheckValidType(const DataType& dtype) { 31 switch (dtype) { 32 case DT_INT64: 33 case DT_FLOAT: 34 case DT_STRING: 35 return Status::OK(); 36 default: 37 return errors::InvalidArgument("Received input dtype: ", 38 DataTypeString(dtype)); 39 } 40 } 41 42 Status CheckTypesMatch(const Feature& feature, const DataType& dtype, 43 bool* match) { 44 switch (dtype) { 45 case DT_INT64: 46 *match = (feature.kind_case() == Feature::kInt64List); 47 break; 48 case DT_FLOAT: 49 *match = (feature.kind_case() == Feature::kFloatList); 50 break; 51 case DT_STRING: 52 *match = (feature.kind_case() == Feature::kBytesList); 53 break; 54 default: 55 return errors::InvalidArgument("Invalid input dtype: ", 56 DataTypeString(dtype)); 57 } 58 return Status::OK(); 59 } 60 61 Status FeatureDenseCopy(const std::size_t out_index, const string& name, 62 const string& key, const DataType& dtype, 63 const TensorShape& shape, const Feature& feature, 64 Tensor* out) { 65 const std::size_t num_elements = shape.num_elements(); 66 const std::size_t offset = out_index * num_elements; 67 68 switch (dtype) { 69 case DT_INT64: { 70 const Int64List& values = feature.int64_list(); 71 if (static_cast<size_t>(values.value_size()) != num_elements) { 72 return errors::InvalidArgument( 73 "Name: ", name, ", Key: ", key, ", Index: ", out_index, 74 ". Number of int64 values != expected. " 75 "values size: ", 76 values.value_size(), " but output shape: ", shape.DebugString()); 77 } 78 auto out_p = out->flat<int64>().data() + offset; 79 std::copy_n(values.value().data(), num_elements, out_p); 80 return Status::OK(); 81 } 82 case DT_FLOAT: { 83 const FloatList& values = feature.float_list(); 84 if (static_cast<size_t>(values.value_size()) != num_elements) { 85 return errors::InvalidArgument( 86 "Name: ", name, ", Key: ", key, ", Index: ", out_index, 87 ". Number of float values != expected. " 88 "values size: ", 89 values.value_size(), " but output shape: ", shape.DebugString()); 90 } 91 auto out_p = out->flat<float>().data() + offset; 92 std::copy_n(values.value().data(), num_elements, out_p); 93 return Status::OK(); 94 } 95 case DT_STRING: { 96 const BytesList& values = feature.bytes_list(); 97 if (static_cast<size_t>(values.value_size()) != num_elements) { 98 return errors::InvalidArgument( 99 "Name: ", name, ", Key ", key, ", Index: ", out_index, 100 ". Number of bytes values != expected. " 101 "Values size: ", 102 values.value_size(), " but output shape: ", shape.DebugString()); 103 } 104 auto out_p = out->flat<string>().data() + offset; 105 std::transform(values.value().data(), 106 values.value().data() + num_elements, out_p, 107 [](const string* s) { return *s; }); 108 return Status::OK(); 109 } 110 default: 111 return errors::InvalidArgument("Invalid input dtype: ", 112 DataTypeString(dtype)); 113 } 114 } 115 116 Tensor FeatureSparseCopy(const std::size_t batch, const string& key, 117 const DataType& dtype, const Feature& feature) { 118 switch (dtype) { 119 case DT_INT64: { 120 const Int64List& values = feature.int64_list(); 121 const int64 num_elements = values.value_size(); 122 Tensor out(dtype, TensorShape({num_elements})); 123 auto out_p = out.flat<int64>().data(); 124 std::copy_n(values.value().data(), num_elements, out_p); 125 return out; 126 } 127 case DT_FLOAT: { 128 const FloatList& values = feature.float_list(); 129 const int64 num_elements = values.value_size(); 130 Tensor out(dtype, TensorShape({num_elements})); 131 auto out_p = out.flat<float>().data(); 132 std::copy_n(values.value().data(), num_elements, out_p); 133 return out; 134 } 135 case DT_STRING: { 136 const BytesList& values = feature.bytes_list(); 137 const int64 num_elements = values.value_size(); 138 Tensor out(dtype, TensorShape({num_elements})); 139 auto out_p = out.flat<string>().data(); 140 std::transform(values.value().data(), 141 values.value().data() + num_elements, out_p, 142 [](const string* s) { return *s; }); 143 return out; 144 } 145 default: 146 LOG(FATAL) << "not supposed to be here. dtype requested: " << dtype; 147 } 148 } 149 150 int64 CopyIntoSparseTensor(const Tensor& in, const int batch, 151 const int64 offset, Tensor* indices, 152 Tensor* values) { 153 const int64 num_elements = in.shape().num_elements(); 154 const DataType& dtype = in.dtype(); 155 CHECK_EQ(dtype, values->dtype()); 156 157 // Update indices. 158 auto ix_t = indices->matrix<int64>(); 159 int64* ix_p = &ix_t(offset, 0); 160 for (int64 i = 0; i < num_elements; ++i, ix_p += 2) { 161 *ix_p = batch; // Column 0 stores the batch entry 162 *(ix_p + 1) = i; // Column 1 stores the index in the batch 163 } 164 165 // Copy values over. 166 switch (dtype) { 167 case DT_INT64: { 168 std::copy_n(in.flat<int64>().data(), num_elements, 169 values->flat<int64>().data() + offset); 170 break; 171 } 172 case DT_FLOAT: { 173 std::copy_n(in.flat<float>().data(), num_elements, 174 values->flat<float>().data() + offset); 175 break; 176 } 177 case DT_STRING: { 178 std::copy_n(in.flat<string>().data(), num_elements, 179 values->flat<string>().data() + offset); 180 break; 181 } 182 default: 183 LOG(FATAL) << "Not supposed to be here. Saw dtype: " << dtype; 184 } 185 186 return num_elements; 187 } 188 189 void RowDenseCopy(const std::size_t& out_index, const DataType& dtype, 190 const Tensor& in, Tensor* out) { 191 const std::size_t num_elements = in.shape().num_elements(); 192 const std::size_t offset = out_index * num_elements; 193 194 switch (dtype) { 195 case DT_INT64: { 196 std::copy_n(in.flat<int64>().data(), num_elements, 197 out->flat<int64>().data() + offset); 198 break; 199 } 200 case DT_FLOAT: { 201 std::copy_n(in.flat<float>().data(), num_elements, 202 out->flat<float>().data() + offset); 203 break; 204 } 205 case DT_STRING: { 206 std::copy_n(in.flat<string>().data(), num_elements, 207 out->flat<string>().data() + offset); 208 break; 209 } 210 default: 211 LOG(FATAL) << "Not supposed to be here. Saw dtype: " << dtype; 212 } 213 } 214 215 Status SingleExampleProtoToTensors( 216 const Example& example, const string& example_name, const int batch_index, 217 const std::vector<FixedLenFeature>& fixed_len_features, 218 const std::vector<VarLenFeature>& var_len_features, 219 std::vector<Tensor*>* output_dense_values_tensor, 220 std::vector<std::vector<Tensor>>* output_sparse_values_tmp) { 221 const Features& features = example.features(); 222 const auto& feature_dict = features.feature(); 223 224 // Handle dense features. 225 for (size_t d = 0; d < fixed_len_features.size(); ++d) { 226 const FixedLenFeature& feature_config = fixed_len_features[d]; 227 const string& key = feature_config.key; 228 const DataType& dtype = feature_config.dtype; 229 const TensorShape& shape = feature_config.shape; 230 const Tensor& default_value = feature_config.default_value; 231 bool required = (default_value.NumElements() == 0); 232 const auto& feature_found = feature_dict.find(key); 233 const bool feature_has_data = // Found key & data type is set 234 (feature_found != feature_dict.end() && 235 (feature_found->second.kind_case() != Feature::KIND_NOT_SET)); 236 237 const bool required_ok = feature_has_data || !required; 238 if (!required_ok) { 239 return errors::InvalidArgument("Name: ", example_name, ", Feature: ", key, 240 " is required but could not be found."); 241 } 242 243 // Perform the FeatureDenseCopy into the output dense_values tensor (if 244 // the value is present). 245 if (feature_has_data) { 246 const Feature& f = feature_found->second; 247 bool types_match; 248 TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match)); 249 if (!types_match) { 250 return errors::InvalidArgument("Name: ", example_name, 251 ", Feature: ", key, 252 ". Data types don't match. ", 253 "Expected type: ", DataTypeString(dtype), 254 " Feature is: ", ProtoDebugString(f)); 255 } 256 TF_RETURN_IF_ERROR(FeatureDenseCopy(batch_index, example_name, key, dtype, 257 shape, f, 258 (*output_dense_values_tensor)[d])); 259 } else { 260 // If the value is missing, RowDenseCopy the default value. 261 RowDenseCopy(batch_index, dtype, default_value, 262 (*output_dense_values_tensor)[d]); 263 } 264 } 265 266 // Handle sparse features. 267 for (size_t d = 0; d < var_len_features.size(); ++d) { 268 const VarLenFeature& feature_config = var_len_features[d]; 269 const string& key = feature_config.key; 270 const DataType& dtype = feature_config.dtype; 271 const auto& feature_found = feature_dict.find(key); 272 273 const bool feature_has_data = // Found key & data type is set 274 (feature_found != feature_dict.end() && 275 (feature_found->second.kind_case() != Feature::KIND_NOT_SET)); 276 277 if (feature_has_data) { 278 const Feature& f = feature_found->second; 279 bool types_match; 280 TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match)); 281 if (!types_match) { 282 return errors::InvalidArgument("Name: ", example_name, 283 ", Feature: ", key, 284 ". Data types don't match. ", 285 "Expected type: ", DataTypeString(dtype), 286 " Feature is: ", ProtoDebugString(f)); 287 } 288 (*output_sparse_values_tmp)[d][batch_index] = 289 FeatureSparseCopy(batch_index, key, dtype, f); 290 } else { 291 (*output_sparse_values_tmp)[d][batch_index] = 292 Tensor(dtype, TensorShape({0})); 293 } 294 } 295 return Status::OK(); 296 } 297 298 Status GetSparseTensorShapes(const VarLenFeature& var_len_feature, 299 const std::vector<Tensor>& sparse_values_tmp, 300 const int batch_size, 301 VarLenFeatureBatchShapes* output_shapes) { 302 int64 total_num_features = 0; 303 int64 max_num_features = 0; 304 for (int b = 0; b < batch_size; ++b) { 305 const Tensor& t = sparse_values_tmp[b]; 306 const int64 num_elements = t.shape().num_elements(); 307 total_num_features += num_elements; 308 max_num_features = std::max(max_num_features, num_elements); 309 } 310 output_shapes->indices_shape.AddDim(total_num_features); 311 output_shapes->indices_shape.AddDim(2); 312 output_shapes->values_shape.AddDim(total_num_features); 313 output_shapes->max_num_features = max_num_features; 314 return Status::OK(); 315 } 316 317 Status BatchExampleProtoToTensors( 318 const std::vector<const Example*>& examples, 319 const std::vector<string>& names, 320 const std::vector<FixedLenFeature>& fixed_len_features, 321 const std::vector<VarLenFeature>& var_len_features, Allocator* allocator, 322 std::vector<Tensor>* output_dense_values_tensor, 323 std::vector<Tensor>* output_sparse_indices_tensor, 324 std::vector<Tensor>* output_sparse_values_tensor, 325 std::vector<Tensor>* output_sparse_shapes_tensor) { 326 const int batch_size = examples.size(); 327 328 const bool has_names = (!names.empty()); 329 if (has_names) { 330 if (names.size() != examples.size()) { 331 return errors::InvalidArgument( 332 "Expected len(names) == len(examples), but got: ", names.size(), 333 " vs. ", examples.size()); 334 } 335 } 336 337 // We also need a map of Tensor pointers for the SingleExampleProtoToTensors 338 // call. (Is there a better solution here?) 339 std::vector<Tensor*> output_dense_values_tensor_ptrs( 340 fixed_len_features.size()); 341 342 // Preallocate dense_values, since we know their sizes. 343 for (size_t d = 0; d < fixed_len_features.size(); ++d) { 344 const FixedLenFeature& config = fixed_len_features[d]; 345 TensorShape out_shape; 346 out_shape.AddDim(batch_size); 347 const TensorShape& shape = config.shape; 348 const DataType& dtype = config.dtype; 349 for (const int dim : shape.dim_sizes()) out_shape.AddDim(dim); 350 (*output_dense_values_tensor)[d] = Tensor(allocator, dtype, out_shape); 351 output_dense_values_tensor_ptrs[d] = &(*output_dense_values_tensor)[d]; 352 } 353 354 // Temporary vector to hold sparse values. 355 std::vector<std::vector<Tensor>> sparse_values_tmp(var_len_features.size()); 356 357 for (size_t d = 0; d < var_len_features.size(); ++d) { 358 sparse_values_tmp[d] = std::vector<Tensor>(batch_size); 359 } 360 361 for (size_t b = 0; b < examples.size(); ++b) { 362 const Example& ex = *(examples[b]); 363 const string& example_name = (has_names) ? names[b] : "<unknown>"; 364 TF_RETURN_IF_ERROR(SingleExampleProtoToTensors( 365 ex, example_name, b, fixed_len_features, var_len_features, 366 &output_dense_values_tensor_ptrs, &sparse_values_tmp)); 367 } 368 369 for (size_t d = 0; d < var_len_features.size(); ++d) { 370 const VarLenFeature& feature_config = var_len_features[d]; 371 const DataType& dtype = feature_config.dtype; 372 const std::vector<Tensor>& sparse_values_tensor = sparse_values_tmp[d]; 373 374 VarLenFeatureBatchShapes sparse_tensor_batch_shapes; 375 TF_RETURN_IF_ERROR(GetSparseTensorShapes(feature_config, 376 sparse_values_tensor, batch_size, 377 &sparse_tensor_batch_shapes)); 378 const TensorShape& indices_shape = sparse_tensor_batch_shapes.indices_shape; 379 const TensorShape& values_shape = sparse_tensor_batch_shapes.values_shape; 380 381 // Allocate the sparse indices here. 382 (*output_sparse_indices_tensor)[d] = 383 Tensor(allocator, DT_INT64, indices_shape); 384 (*output_sparse_values_tensor)[d] = Tensor(allocator, dtype, values_shape); 385 (*output_sparse_shapes_tensor)[d] = 386 Tensor(allocator, DT_INT64, TensorShape({2})); 387 388 auto shape_t = (*output_sparse_shapes_tensor)[d].vec<int64>(); 389 shape_t(0) = batch_size; 390 shape_t(1) = sparse_tensor_batch_shapes.max_num_features; 391 392 Tensor* sp_indices_d = &(*output_sparse_indices_tensor)[d]; 393 Tensor* sp_values_d = &(*output_sparse_values_tensor)[d]; 394 395 int64 offset = 0; 396 for (int b = 0; b < batch_size; ++b) { 397 const int64 num_elements = CopyIntoSparseTensor( 398 sparse_values_tensor[b], b, offset, sp_indices_d, sp_values_d); 399 offset += num_elements; 400 } 401 } 402 return Status::OK(); 403 } 404 405 Status ParseExampleAttrs::FinishInit() { 406 if (static_cast<size_t>(num_sparse) != sparse_types.size()) { 407 return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)"); 408 } 409 if (static_cast<size_t>(num_dense) != dense_types.size()) { 410 return errors::InvalidArgument("len(dense_keys) != len(dense_types)"); 411 } 412 if (static_cast<size_t>(num_dense) != dense_shapes.size()) { 413 return errors::InvalidArgument("len(dense_keys) != len(dense_shapes)"); 414 } 415 if (num_dense > std::numeric_limits<int32>::max()) { 416 return errors::InvalidArgument("num_dense_ too large"); 417 } 418 for (const DataType& type : dense_types) { 419 TF_RETURN_IF_ERROR(CheckValidType(type)); 420 } 421 for (const DataType& type : sparse_types) { 422 TF_RETURN_IF_ERROR(CheckValidType(type)); 423 } 424 return Status::OK(); 425 } 426 427 Status ParseSingleExampleAttrs::FinishInit() { 428 if (sparse_keys.size() != sparse_types.size()) { 429 return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)"); 430 } 431 if (dense_keys.size() != dense_types.size()) { 432 return errors::InvalidArgument("len(dense_keys) != len(dense_types)"); 433 } 434 if (dense_keys.size() != dense_shapes.size()) { 435 return errors::InvalidArgument("len(dense_keys) != len(dense_shapes)"); 436 } 437 for (const DataType& type : dense_types) { 438 TF_RETURN_IF_ERROR(CheckValidType(type)); 439 } 440 for (const DataType& type : sparse_types) { 441 TF_RETURN_IF_ERROR(CheckValidType(type)); 442 } 443 return Status::OK(); 444 } 445 446 Status ParseSingleSequenceExampleAttrs::FinishInit() { 447 if (static_cast<size_t>(num_context_sparse) != context_sparse_types.size()) { 448 return errors::InvalidArgument( 449 "len(context_sparse_keys) != len(context_sparse_types)"); 450 } 451 if (static_cast<size_t>(num_context_dense) != context_dense_types.size()) { 452 return errors::InvalidArgument( 453 "len(context_dense_keys) != len(context_dense_types)"); 454 } 455 if (static_cast<size_t>(num_context_dense) != context_dense_shapes.size()) { 456 return errors::InvalidArgument( 457 "len(context_dense_keys) != len(context_dense_shapes)"); 458 } 459 if (static_cast<size_t>(num_feature_list_sparse) != 460 feature_list_sparse_types.size()) { 461 return errors::InvalidArgument( 462 "len(feature_list_sparse_keys) != len(feature_list_sparse_types)"); 463 } 464 if (static_cast<size_t>(num_feature_list_dense) != 465 feature_list_dense_types.size()) { 466 return errors::InvalidArgument( 467 "len(feature_list_dense_keys) != " 468 "len(feature_list_dense_types)"); 469 } 470 for (const DataType& type : context_dense_types) { 471 TF_RETURN_IF_ERROR(CheckValidType(type)); 472 } 473 for (const DataType& type : context_sparse_types) { 474 TF_RETURN_IF_ERROR(CheckValidType(type)); 475 } 476 for (const DataType& type : feature_list_dense_types) { 477 TF_RETURN_IF_ERROR(CheckValidType(type)); 478 } 479 for (const DataType& type : feature_list_sparse_types) { 480 TF_RETURN_IF_ERROR(CheckValidType(type)); 481 } 482 return Status::OK(); 483 } 484 485 } // namespace tensorflow 486