1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include <unordered_set> 17 18 #include "tensorflow/core/debug/debug_io_utils.h" 19 20 #include "tensorflow/core/debug/debug_callback_registry.h" 21 #include "tensorflow/core/debug/debug_node_key.h" 22 #include "tensorflow/core/debug/debugger_event_metadata.pb.h" 23 #include "tensorflow/core/framework/summary.pb.h" 24 #include "tensorflow/core/framework/tensor_testutil.h" 25 #include "tensorflow/core/lib/core/notification.h" 26 #include "tensorflow/core/lib/core/status_test_util.h" 27 #include "tensorflow/core/lib/core/threadpool.h" 28 #include "tensorflow/core/lib/io/path.h" 29 #include "tensorflow/core/lib/strings/str_util.h" 30 #include "tensorflow/core/platform/env.h" 31 #include "tensorflow/core/util/event.pb.h" 32 33 namespace tensorflow { 34 namespace { 35 36 class DebugIOUtilsTest : public ::testing::Test { 37 public: 38 void Initialize() { 39 env_ = Env::Default(); 40 41 tensor_a_.reset(new Tensor(DT_FLOAT, TensorShape({2, 2}))); 42 tensor_a_->flat<float>()(0) = 5.0; 43 tensor_a_->flat<float>()(1) = 3.0; 44 tensor_a_->flat<float>()(2) = -1.0; 45 tensor_a_->flat<float>()(3) = 0.0; 46 47 tensor_b_.reset(new Tensor(DT_STRING, TensorShape{2})); 48 tensor_b_->flat<string>()(0) = "corge"; 49 tensor_b_->flat<string>()(1) = "garply"; 50 } 51 52 Env* env_; 53 std::unique_ptr<Tensor> tensor_a_; 54 std::unique_ptr<Tensor> tensor_b_; 55 }; 56 57 TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) { 58 DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2", 59 "hidden_1/MatMul", 0, "DebugIdentity"); 60 EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", 61 debug_node_key.device_name); 62 EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name); 63 EXPECT_EQ(0, debug_node_key.output_slot); 64 EXPECT_EQ("DebugIdentity", debug_node_key.debug_op); 65 EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name); 66 EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,device_GPU_2", 67 debug_node_key.device_path); 68 } 69 70 TEST_F(DebugIOUtilsTest, EqualityOfDebugNodeKeys) { 71 const DebugNodeKey debug_node_key_1("/job:worker/replica:1/task:0/gpu:2", 72 "hidden_1/MatMul", 0, "DebugIdentity"); 73 const DebugNodeKey debug_node_key_2("/job:worker/replica:1/task:0/gpu:2", 74 "hidden_1/MatMul", 0, "DebugIdentity"); 75 const DebugNodeKey debug_node_key_3("/job:worker/replica:1/task:0/gpu:2", 76 "hidden_1/BiasAdd", 0, "DebugIdentity"); 77 const DebugNodeKey debug_node_key_4("/job:worker/replica:1/task:0/gpu:2", 78 "hidden_1/MatMul", 0, 79 "DebugNumericSummary"); 80 EXPECT_EQ(debug_node_key_1, debug_node_key_2); 81 EXPECT_NE(debug_node_key_1, debug_node_key_3); 82 EXPECT_NE(debug_node_key_1, debug_node_key_4); 83 EXPECT_NE(debug_node_key_3, debug_node_key_4); 84 } 85 86 TEST_F(DebugIOUtilsTest, DebugNodeKeysIsHashable) { 87 const DebugNodeKey debug_node_key_1("/job:worker/replica:1/task:0/gpu:2", 88 "hidden_1/MatMul", 0, "DebugIdentity"); 89 const DebugNodeKey debug_node_key_2("/job:worker/replica:1/task:0/gpu:2", 90 "hidden_1/MatMul", 0, "DebugIdentity"); 91 const DebugNodeKey debug_node_key_3("/job:worker/replica:1/task:0/gpu:2", 92 "hidden_1/BiasAdd", 0, "DebugIdentity"); 93 94 std::unordered_set<DebugNodeKey> keys; 95 keys.insert(debug_node_key_1); 96 ASSERT_EQ(1, keys.size()); 97 keys.insert(debug_node_key_3); 98 ASSERT_EQ(2, keys.size()); 99 keys.erase(debug_node_key_2); 100 ASSERT_EQ(1, keys.size()); 101 } 102 103 TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) { 104 Initialize(); 105 106 const string test_dir = testing::TmpDir(); 107 108 // Append levels of nonexisting directories, to test that the function can 109 // create directories. 110 const uint64 wall_time = env_->NowMicros(); 111 const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", 112 "foo/bar/qux/tensor_a", 0, "DebugIdentity"); 113 114 string dump_file_path; 115 TF_ASSERT_OK(DebugFileIO::DumpTensorToDir( 116 kDebugNodeKey, *tensor_a_, wall_time, test_dir, &dump_file_path)); 117 118 // Read the file into a Event proto. 119 Event event; 120 TF_ASSERT_OK(ReadEventFromFile(dump_file_path, &event)); 121 122 ASSERT_GE(wall_time, event.wall_time()); 123 ASSERT_EQ(1, event.summary().value().size()); 124 ASSERT_EQ(kDebugNodeKey.debug_node_name, 125 event.summary().value(0).node_name()); 126 127 Tensor a_prime(DT_FLOAT); 128 ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor())); 129 130 // Verify tensor shape and value. 131 ASSERT_EQ(tensor_a_->shape(), a_prime.shape()); 132 for (int i = 0; i < a_prime.flat<float>().size(); ++i) { 133 ASSERT_EQ(tensor_a_->flat<float>()(i), a_prime.flat<float>()(i)); 134 } 135 136 // Tear down temporary file and directories. 137 int64 undeleted_files = 0; 138 int64 undeleted_dirs = 0; 139 ASSERT_TRUE( 140 env_->DeleteRecursively(test_dir, &undeleted_files, &undeleted_dirs) 141 .ok()); 142 ASSERT_EQ(0, undeleted_files); 143 ASSERT_EQ(0, undeleted_dirs); 144 } 145 146 TEST_F(DebugIOUtilsTest, DumpStringTensorToFileSunnyDay) { 147 Initialize(); 148 149 const string test_dir = testing::TmpDir(); 150 151 const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", 152 "quux/grault/tensor_b", 1, "DebugIdentity"); 153 const uint64 wall_time = env_->NowMicros(); 154 155 string dump_file_name; 156 Status s = DebugFileIO::DumpTensorToDir(kDebugNodeKey, *tensor_b_, wall_time, 157 test_dir, &dump_file_name); 158 ASSERT_TRUE(s.ok()); 159 160 // Read the file into a Event proto. 161 Event event; 162 TF_ASSERT_OK(ReadEventFromFile(dump_file_name, &event)); 163 164 ASSERT_GE(wall_time, event.wall_time()); 165 ASSERT_EQ(1, event.summary().value().size()); 166 ASSERT_EQ(kDebugNodeKey.node_name, event.summary().value(0).tag()); 167 ASSERT_EQ(kDebugNodeKey.debug_node_name, 168 event.summary().value(0).node_name()); 169 170 // Determine and validate some information from the metadata. 171 third_party::tensorflow::core::debug::DebuggerEventMetadata metadata; 172 auto status = tensorflow::protobuf::util::JsonStringToMessage( 173 event.summary().value(0).metadata().plugin_data().content(), &metadata); 174 ASSERT_TRUE(status.ok()); 175 ASSERT_EQ(kDebugNodeKey.device_name, metadata.device()); 176 ASSERT_EQ(kDebugNodeKey.output_slot, metadata.output_slot()); 177 178 Tensor b_prime(DT_STRING); 179 ASSERT_TRUE(b_prime.FromProto(event.summary().value(0).tensor())); 180 181 // Verify tensor shape and value. 182 ASSERT_EQ(tensor_b_->shape(), b_prime.shape()); 183 for (int i = 0; i < b_prime.flat<string>().size(); ++i) { 184 ASSERT_EQ(tensor_b_->flat<string>()(i), b_prime.flat<string>()(i)); 185 } 186 187 // Tear down temporary file and directories. 188 int64 undeleted_files = 0; 189 int64 undeleted_dirs = 0; 190 ASSERT_TRUE( 191 env_->DeleteRecursively(test_dir, &undeleted_files, &undeleted_dirs) 192 .ok()); 193 ASSERT_EQ(0, undeleted_files); 194 ASSERT_EQ(0, undeleted_dirs); 195 } 196 197 TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) { 198 Initialize(); 199 200 // First, create the file at the path. 201 const string test_dir = testing::TmpDir(); 202 const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0"; 203 const DebugNodeKey kDebugNodeKey(kDeviceName, "baz/tensor_a", 0, 204 "DebugIdentity"); 205 const string txt_file_dir = 206 io::JoinPath(test_dir, DebugNodeKey::DeviceNameToDevicePath(kDeviceName)); 207 const string txt_file_name = io::JoinPath(txt_file_dir, "baz"); 208 if (!env_->FileExists(txt_file_dir).ok()) { 209 ASSERT_TRUE(env_->RecursivelyCreateDir(txt_file_dir).ok()); 210 } 211 ASSERT_EQ(error::Code::NOT_FOUND, env_->FileExists(txt_file_name).code()); 212 213 std::unique_ptr<WritableFile> file; 214 ASSERT_TRUE(env_->NewWritableFile(txt_file_name, &file).ok()); 215 TF_EXPECT_OK(file->Append("text in baz")); 216 TF_EXPECT_OK(file->Flush()); 217 TF_ASSERT_OK(file->Close()); 218 219 // Verify that the path exists and that it is a file, not a directory. 220 ASSERT_TRUE(env_->FileExists(txt_file_name).ok()); 221 ASSERT_FALSE(env_->IsDirectory(txt_file_name).ok()); 222 223 // Second, try to dump the tensor to a path that requires "baz" to be a 224 // directory, which should lead to an error. 225 226 const uint64 wall_time = env_->NowMicros(); 227 228 string dump_file_name; 229 Status s = DebugFileIO::DumpTensorToDir(kDebugNodeKey, *tensor_a_, wall_time, 230 test_dir, &dump_file_name); 231 ASSERT_FALSE(s.ok()); 232 233 // Tear down temporary file and directories. 234 int64 undeleted_files = 0; 235 int64 undeleted_dirs = 0; 236 ASSERT_TRUE( 237 env_->DeleteRecursively(test_dir, &undeleted_files, &undeleted_dirs) 238 .ok()); 239 ASSERT_EQ(0, undeleted_files); 240 ASSERT_EQ(0, undeleted_dirs); 241 } 242 243 TEST_F(DebugIOUtilsTest, PublishTensorToMultipleFileURLs) { 244 Initialize(); 245 246 const int kNumDumpRoots = 3; 247 const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", 248 "foo/bar/qux/tensor_a", 0, "DebugIdentity"); 249 const uint64 wall_time = env_->NowMicros(); 250 251 std::vector<string> dump_roots; 252 std::vector<string> dump_file_paths; 253 std::vector<string> urls; 254 for (int i = 0; i < kNumDumpRoots; ++i) { 255 string dump_root = strings::StrCat(testing::TmpDir(), "/", i); 256 257 dump_roots.push_back(dump_root); 258 dump_file_paths.push_back( 259 DebugFileIO::GetDumpFilePath(dump_root, kDebugNodeKey, wall_time)); 260 urls.push_back(strings::StrCat("file://", dump_root)); 261 } 262 263 for (int i = 1; i < kNumDumpRoots; ++i) { 264 ASSERT_NE(dump_roots[0], dump_roots[i]); 265 } 266 267 Status s = 268 DebugIO::PublishDebugTensor(kDebugNodeKey, *tensor_a_, wall_time, urls); 269 ASSERT_TRUE(s.ok()); 270 271 // Try reading the file into a Event proto. 272 for (int i = 0; i < kNumDumpRoots; ++i) { 273 // Read the file into a Event proto. 274 Event event; 275 TF_ASSERT_OK(ReadEventFromFile(dump_file_paths[i], &event)); 276 277 ASSERT_GE(wall_time, event.wall_time()); 278 ASSERT_EQ(1, event.summary().value().size()); 279 ASSERT_EQ(kDebugNodeKey.node_name, event.summary().value(0).tag()); 280 ASSERT_EQ(kDebugNodeKey.debug_node_name, 281 event.summary().value(0).node_name()); 282 283 // Determine and validate some information from the metadata. 284 third_party::tensorflow::core::debug::DebuggerEventMetadata metadata; 285 auto status = tensorflow::protobuf::util::JsonStringToMessage( 286 event.summary().value(0).metadata().plugin_data().content(), &metadata); 287 ASSERT_TRUE(status.ok()); 288 ASSERT_EQ(kDebugNodeKey.device_name, metadata.device()); 289 ASSERT_EQ(kDebugNodeKey.output_slot, metadata.output_slot()); 290 291 Tensor a_prime(DT_FLOAT); 292 ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor())); 293 294 // Verify tensor shape and value. 295 ASSERT_EQ(tensor_a_->shape(), a_prime.shape()); 296 for (int i = 0; i < a_prime.flat<float>().size(); ++i) { 297 ASSERT_EQ(tensor_a_->flat<float>()(i), a_prime.flat<float>()(i)); 298 } 299 } 300 301 // Tear down temporary file and directories. 302 for (int i = 0; i < kNumDumpRoots; ++i) { 303 int64 undeleted_files = 0; 304 int64 undeleted_dirs = 0; 305 ASSERT_TRUE(env_->DeleteRecursively(dump_roots[i], &undeleted_files, 306 &undeleted_dirs) 307 .ok()); 308 ASSERT_EQ(0, undeleted_files); 309 ASSERT_EQ(0, undeleted_dirs); 310 } 311 } 312 313 TEST_F(DebugIOUtilsTest, PublishTensorToMemoryCallback) { 314 Initialize(); 315 316 const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", 317 "foo/bar/qux/tensor_a", 0, "DebugIdentity"); 318 const uint64 wall_time = env_->NowMicros(); 319 320 bool called = false; 321 std::vector<string> urls = {"memcbk://test_callback"}; 322 ; 323 324 auto* callback_registry = DebugCallbackRegistry::singleton(); 325 callback_registry->RegisterCallback( 326 "test_callback", [this, &kDebugNodeKey, &called](const DebugNodeKey& key, 327 const Tensor& tensor) { 328 called = true; 329 ASSERT_EQ(kDebugNodeKey.device_name, key.device_name); 330 ASSERT_EQ(kDebugNodeKey.node_name, key.node_name); 331 ASSERT_EQ(tensor_a_->shape(), tensor.shape()); 332 for (int i = 0; i < tensor.flat<float>().size(); ++i) { 333 ASSERT_EQ(tensor_a_->flat<float>()(i), tensor.flat<float>()(i)); 334 } 335 }); 336 337 Status s = 338 DebugIO::PublishDebugTensor(kDebugNodeKey, *tensor_a_, wall_time, urls); 339 ASSERT_TRUE(s.ok()); 340 ASSERT_TRUE(called); 341 342 callback_registry->UnregisterCallback("test_callback"); 343 } 344 345 TEST_F(DebugIOUtilsTest, PublishTensorConcurrentlyToPartiallyOverlappingPaths) { 346 Initialize(); 347 348 const int kConcurrentPubs = 3; 349 const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", 350 "tensor_a", 0, "DebugIdentity"); 351 352 thread::ThreadPool* tp = 353 new thread::ThreadPool(Env::Default(), "test", kConcurrentPubs); 354 const uint64 wall_time = env_->NowMicros(); 355 const string dump_root_base = testing::TmpDir(); 356 357 mutex mu; 358 std::vector<string> dump_roots GUARDED_BY(mu); 359 std::vector<string> dump_file_paths GUARDED_BY(mu); 360 361 int dump_count GUARDED_BY(mu) = 0; 362 int done_count GUARDED_BY(mu) = 0; 363 Notification all_done; 364 365 auto fn = [this, &dump_count, &done_count, &mu, &dump_root_base, &dump_roots, 366 &dump_file_paths, &wall_time, &kDebugNodeKey, &kConcurrentPubs, 367 &all_done]() { 368 // "gumpy" is the shared directory part of the path. 369 string dump_root; 370 string debug_url; 371 { 372 mutex_lock l(mu); 373 dump_root = 374 strings::StrCat(dump_root_base, "grumpy/", "dump_", dump_count++); 375 376 dump_roots.push_back(dump_root); 377 dump_file_paths.push_back( 378 DebugFileIO::GetDumpFilePath(dump_root, kDebugNodeKey, wall_time)); 379 380 debug_url = strings::StrCat("file://", dump_root); 381 } 382 383 std::vector<string> urls; 384 urls.push_back(debug_url); 385 386 Status s = 387 DebugIO::PublishDebugTensor(kDebugNodeKey, *tensor_a_, wall_time, urls); 388 ASSERT_TRUE(s.ok()); 389 390 { 391 mutex_lock l(mu); 392 393 done_count++; 394 if (done_count == kConcurrentPubs) { 395 all_done.Notify(); 396 } 397 } 398 }; 399 400 for (int i = 0; i < kConcurrentPubs; ++i) { 401 tp->Schedule(fn); 402 } 403 404 // Wait for all dumping calls to finish. 405 all_done.WaitForNotification(); 406 delete tp; 407 408 { 409 mutex_lock l(mu); 410 411 for (int i = 1; i < kConcurrentPubs; ++i) { 412 ASSERT_NE(dump_roots[0], dump_roots[i]); 413 } 414 415 // Try reading the file into a Event proto. 416 for (int i = 0; i < kConcurrentPubs; ++i) { 417 // Read the file into a Event proto. 418 Event event; 419 TF_ASSERT_OK(ReadEventFromFile(dump_file_paths[i], &event)); 420 421 ASSERT_GE(wall_time, event.wall_time()); 422 ASSERT_EQ(1, event.summary().value().size()); 423 ASSERT_EQ(kDebugNodeKey.node_name, event.summary().value(0).tag()); 424 ASSERT_EQ(kDebugNodeKey.debug_node_name, 425 event.summary().value(0).node_name()); 426 427 // Determine and validate some information from the metadata. 428 third_party::tensorflow::core::debug::DebuggerEventMetadata metadata; 429 auto status = tensorflow::protobuf::util::JsonStringToMessage( 430 event.summary().value(0).metadata().plugin_data().content(), 431 &metadata); 432 ASSERT_TRUE(status.ok()); 433 ASSERT_EQ(kDebugNodeKey.device_name, metadata.device()); 434 ASSERT_EQ(kDebugNodeKey.output_slot, metadata.output_slot()); 435 436 Tensor a_prime(DT_FLOAT); 437 ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor())); 438 439 // Verify tensor shape and value. 440 ASSERT_EQ(tensor_a_->shape(), a_prime.shape()); 441 for (int i = 0; i < a_prime.flat<float>().size(); ++i) { 442 ASSERT_EQ(tensor_a_->flat<float>()(i), a_prime.flat<float>()(i)); 443 } 444 } 445 446 // Tear down temporary file and directories. 447 int64 undeleted_files = 0; 448 int64 undeleted_dirs = 0; 449 ASSERT_TRUE(env_->DeleteRecursively(dump_root_base, &undeleted_files, 450 &undeleted_dirs) 451 .ok()); 452 ASSERT_EQ(0, undeleted_files); 453 ASSERT_EQ(0, undeleted_dirs); 454 } 455 } 456 457 } // namespace 458 } // namespace tensorflow 459