Home | History | Annotate | Download | only in debug
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include <unordered_set>
     17 
     18 #include "tensorflow/core/debug/debug_io_utils.h"
     19 
     20 #include "tensorflow/core/debug/debug_callback_registry.h"
     21 #include "tensorflow/core/debug/debug_node_key.h"
     22 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
     23 #include "tensorflow/core/framework/summary.pb.h"
     24 #include "tensorflow/core/framework/tensor_testutil.h"
     25 #include "tensorflow/core/lib/core/notification.h"
     26 #include "tensorflow/core/lib/core/status_test_util.h"
     27 #include "tensorflow/core/lib/core/threadpool.h"
     28 #include "tensorflow/core/lib/io/path.h"
     29 #include "tensorflow/core/lib/strings/str_util.h"
     30 #include "tensorflow/core/platform/env.h"
     31 #include "tensorflow/core/util/event.pb.h"
     32 
     33 namespace tensorflow {
     34 namespace {
     35 
     36 class DebugIOUtilsTest : public ::testing::Test {
     37  public:
     38   void Initialize() {
     39     env_ = Env::Default();
     40 
     41     tensor_a_.reset(new Tensor(DT_FLOAT, TensorShape({2, 2})));
     42     tensor_a_->flat<float>()(0) = 5.0;
     43     tensor_a_->flat<float>()(1) = 3.0;
     44     tensor_a_->flat<float>()(2) = -1.0;
     45     tensor_a_->flat<float>()(3) = 0.0;
     46 
     47     tensor_b_.reset(new Tensor(DT_STRING, TensorShape{2}));
     48     tensor_b_->flat<string>()(0) = "corge";
     49     tensor_b_->flat<string>()(1) = "garply";
     50   }
     51 
     52   Env* env_;
     53   std::unique_ptr<Tensor> tensor_a_;
     54   std::unique_ptr<Tensor> tensor_b_;
     55 };
     56 
     57 TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
     58   DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2",
     59                               "hidden_1/MatMul", 0, "DebugIdentity");
     60   EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2",
     61             debug_node_key.device_name);
     62   EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
     63   EXPECT_EQ(0, debug_node_key.output_slot);
     64   EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
     65   EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name);
     66   EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,device_GPU_2",
     67             debug_node_key.device_path);
     68 }
     69 
     70 TEST_F(DebugIOUtilsTest, EqualityOfDebugNodeKeys) {
     71   const DebugNodeKey debug_node_key_1("/job:worker/replica:1/task:0/gpu:2",
     72                                       "hidden_1/MatMul", 0, "DebugIdentity");
     73   const DebugNodeKey debug_node_key_2("/job:worker/replica:1/task:0/gpu:2",
     74                                       "hidden_1/MatMul", 0, "DebugIdentity");
     75   const DebugNodeKey debug_node_key_3("/job:worker/replica:1/task:0/gpu:2",
     76                                       "hidden_1/BiasAdd", 0, "DebugIdentity");
     77   const DebugNodeKey debug_node_key_4("/job:worker/replica:1/task:0/gpu:2",
     78                                       "hidden_1/MatMul", 0,
     79                                       "DebugNumericSummary");
     80   EXPECT_EQ(debug_node_key_1, debug_node_key_2);
     81   EXPECT_NE(debug_node_key_1, debug_node_key_3);
     82   EXPECT_NE(debug_node_key_1, debug_node_key_4);
     83   EXPECT_NE(debug_node_key_3, debug_node_key_4);
     84 }
     85 
     86 TEST_F(DebugIOUtilsTest, DebugNodeKeysIsHashable) {
     87   const DebugNodeKey debug_node_key_1("/job:worker/replica:1/task:0/gpu:2",
     88                                       "hidden_1/MatMul", 0, "DebugIdentity");
     89   const DebugNodeKey debug_node_key_2("/job:worker/replica:1/task:0/gpu:2",
     90                                       "hidden_1/MatMul", 0, "DebugIdentity");
     91   const DebugNodeKey debug_node_key_3("/job:worker/replica:1/task:0/gpu:2",
     92                                       "hidden_1/BiasAdd", 0, "DebugIdentity");
     93 
     94   std::unordered_set<DebugNodeKey> keys;
     95   keys.insert(debug_node_key_1);
     96   ASSERT_EQ(1, keys.size());
     97   keys.insert(debug_node_key_3);
     98   ASSERT_EQ(2, keys.size());
     99   keys.erase(debug_node_key_2);
    100   ASSERT_EQ(1, keys.size());
    101 }
    102 
    103 TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) {
    104   Initialize();
    105 
    106   const string test_dir = testing::TmpDir();
    107 
    108   // Append levels of nonexisting directories, to test that the function can
    109   // create directories.
    110   const uint64 wall_time = env_->NowMicros();
    111   const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
    112                                    "foo/bar/qux/tensor_a", 0, "DebugIdentity");
    113 
    114   string dump_file_path;
    115   TF_ASSERT_OK(DebugFileIO::DumpTensorToDir(
    116       kDebugNodeKey, *tensor_a_, wall_time, test_dir, &dump_file_path));
    117 
    118   // Read the file into a Event proto.
    119   Event event;
    120   TF_ASSERT_OK(ReadEventFromFile(dump_file_path, &event));
    121 
    122   ASSERT_GE(wall_time, event.wall_time());
    123   ASSERT_EQ(1, event.summary().value().size());
    124   ASSERT_EQ(kDebugNodeKey.debug_node_name,
    125             event.summary().value(0).node_name());
    126 
    127   Tensor a_prime(DT_FLOAT);
    128   ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor()));
    129 
    130   // Verify tensor shape and value.
    131   ASSERT_EQ(tensor_a_->shape(), a_prime.shape());
    132   for (int i = 0; i < a_prime.flat<float>().size(); ++i) {
    133     ASSERT_EQ(tensor_a_->flat<float>()(i), a_prime.flat<float>()(i));
    134   }
    135 
    136   // Tear down temporary file and directories.
    137   int64 undeleted_files = 0;
    138   int64 undeleted_dirs = 0;
    139   ASSERT_TRUE(
    140       env_->DeleteRecursively(test_dir, &undeleted_files, &undeleted_dirs)
    141           .ok());
    142   ASSERT_EQ(0, undeleted_files);
    143   ASSERT_EQ(0, undeleted_dirs);
    144 }
    145 
    146 TEST_F(DebugIOUtilsTest, DumpStringTensorToFileSunnyDay) {
    147   Initialize();
    148 
    149   const string test_dir = testing::TmpDir();
    150 
    151   const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
    152                                    "quux/grault/tensor_b", 1, "DebugIdentity");
    153   const uint64 wall_time = env_->NowMicros();
    154 
    155   string dump_file_name;
    156   Status s = DebugFileIO::DumpTensorToDir(kDebugNodeKey, *tensor_b_, wall_time,
    157                                           test_dir, &dump_file_name);
    158   ASSERT_TRUE(s.ok());
    159 
    160   // Read the file into a Event proto.
    161   Event event;
    162   TF_ASSERT_OK(ReadEventFromFile(dump_file_name, &event));
    163 
    164   ASSERT_GE(wall_time, event.wall_time());
    165   ASSERT_EQ(1, event.summary().value().size());
    166   ASSERT_EQ(kDebugNodeKey.node_name, event.summary().value(0).tag());
    167   ASSERT_EQ(kDebugNodeKey.debug_node_name,
    168             event.summary().value(0).node_name());
    169 
    170   // Determine and validate some information from the metadata.
    171   third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
    172   auto status = tensorflow::protobuf::util::JsonStringToMessage(
    173       event.summary().value(0).metadata().plugin_data().content(), &metadata);
    174   ASSERT_TRUE(status.ok());
    175   ASSERT_EQ(kDebugNodeKey.device_name, metadata.device());
    176   ASSERT_EQ(kDebugNodeKey.output_slot, metadata.output_slot());
    177 
    178   Tensor b_prime(DT_STRING);
    179   ASSERT_TRUE(b_prime.FromProto(event.summary().value(0).tensor()));
    180 
    181   // Verify tensor shape and value.
    182   ASSERT_EQ(tensor_b_->shape(), b_prime.shape());
    183   for (int i = 0; i < b_prime.flat<string>().size(); ++i) {
    184     ASSERT_EQ(tensor_b_->flat<string>()(i), b_prime.flat<string>()(i));
    185   }
    186 
    187   // Tear down temporary file and directories.
    188   int64 undeleted_files = 0;
    189   int64 undeleted_dirs = 0;
    190   ASSERT_TRUE(
    191       env_->DeleteRecursively(test_dir, &undeleted_files, &undeleted_dirs)
    192           .ok());
    193   ASSERT_EQ(0, undeleted_files);
    194   ASSERT_EQ(0, undeleted_dirs);
    195 }
    196 
    197 TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) {
    198   Initialize();
    199 
    200   // First, create the file at the path.
    201   const string test_dir = testing::TmpDir();
    202   const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
    203   const DebugNodeKey kDebugNodeKey(kDeviceName, "baz/tensor_a", 0,
    204                                    "DebugIdentity");
    205   const string txt_file_dir =
    206       io::JoinPath(test_dir, DebugNodeKey::DeviceNameToDevicePath(kDeviceName));
    207   const string txt_file_name = io::JoinPath(txt_file_dir, "baz");
    208   if (!env_->FileExists(txt_file_dir).ok()) {
    209     ASSERT_TRUE(env_->RecursivelyCreateDir(txt_file_dir).ok());
    210   }
    211   ASSERT_EQ(error::Code::NOT_FOUND, env_->FileExists(txt_file_name).code());
    212 
    213   std::unique_ptr<WritableFile> file;
    214   ASSERT_TRUE(env_->NewWritableFile(txt_file_name, &file).ok());
    215   TF_EXPECT_OK(file->Append("text in baz"));
    216   TF_EXPECT_OK(file->Flush());
    217   TF_ASSERT_OK(file->Close());
    218 
    219   // Verify that the path exists and that it is a file, not a directory.
    220   ASSERT_TRUE(env_->FileExists(txt_file_name).ok());
    221   ASSERT_FALSE(env_->IsDirectory(txt_file_name).ok());
    222 
    223   // Second, try to dump the tensor to a path that requires "baz" to be a
    224   // directory, which should lead to an error.
    225 
    226   const uint64 wall_time = env_->NowMicros();
    227 
    228   string dump_file_name;
    229   Status s = DebugFileIO::DumpTensorToDir(kDebugNodeKey, *tensor_a_, wall_time,
    230                                           test_dir, &dump_file_name);
    231   ASSERT_FALSE(s.ok());
    232 
    233   // Tear down temporary file and directories.
    234   int64 undeleted_files = 0;
    235   int64 undeleted_dirs = 0;
    236   ASSERT_TRUE(
    237       env_->DeleteRecursively(test_dir, &undeleted_files, &undeleted_dirs)
    238           .ok());
    239   ASSERT_EQ(0, undeleted_files);
    240   ASSERT_EQ(0, undeleted_dirs);
    241 }
    242 
    243 TEST_F(DebugIOUtilsTest, PublishTensorToMultipleFileURLs) {
    244   Initialize();
    245 
    246   const int kNumDumpRoots = 3;
    247   const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
    248                                    "foo/bar/qux/tensor_a", 0, "DebugIdentity");
    249   const uint64 wall_time = env_->NowMicros();
    250 
    251   std::vector<string> dump_roots;
    252   std::vector<string> dump_file_paths;
    253   std::vector<string> urls;
    254   for (int i = 0; i < kNumDumpRoots; ++i) {
    255     string dump_root = strings::StrCat(testing::TmpDir(), "/", i);
    256 
    257     dump_roots.push_back(dump_root);
    258     dump_file_paths.push_back(
    259         DebugFileIO::GetDumpFilePath(dump_root, kDebugNodeKey, wall_time));
    260     urls.push_back(strings::StrCat("file://", dump_root));
    261   }
    262 
    263   for (int i = 1; i < kNumDumpRoots; ++i) {
    264     ASSERT_NE(dump_roots[0], dump_roots[i]);
    265   }
    266 
    267   Status s =
    268       DebugIO::PublishDebugTensor(kDebugNodeKey, *tensor_a_, wall_time, urls);
    269   ASSERT_TRUE(s.ok());
    270 
    271   // Try reading the file into a Event proto.
    272   for (int i = 0; i < kNumDumpRoots; ++i) {
    273     // Read the file into a Event proto.
    274     Event event;
    275     TF_ASSERT_OK(ReadEventFromFile(dump_file_paths[i], &event));
    276 
    277     ASSERT_GE(wall_time, event.wall_time());
    278     ASSERT_EQ(1, event.summary().value().size());
    279     ASSERT_EQ(kDebugNodeKey.node_name, event.summary().value(0).tag());
    280     ASSERT_EQ(kDebugNodeKey.debug_node_name,
    281               event.summary().value(0).node_name());
    282 
    283     // Determine and validate some information from the metadata.
    284     third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
    285     auto status = tensorflow::protobuf::util::JsonStringToMessage(
    286         event.summary().value(0).metadata().plugin_data().content(), &metadata);
    287     ASSERT_TRUE(status.ok());
    288     ASSERT_EQ(kDebugNodeKey.device_name, metadata.device());
    289     ASSERT_EQ(kDebugNodeKey.output_slot, metadata.output_slot());
    290 
    291     Tensor a_prime(DT_FLOAT);
    292     ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor()));
    293 
    294     // Verify tensor shape and value.
    295     ASSERT_EQ(tensor_a_->shape(), a_prime.shape());
    296     for (int i = 0; i < a_prime.flat<float>().size(); ++i) {
    297       ASSERT_EQ(tensor_a_->flat<float>()(i), a_prime.flat<float>()(i));
    298     }
    299   }
    300 
    301   // Tear down temporary file and directories.
    302   for (int i = 0; i < kNumDumpRoots; ++i) {
    303     int64 undeleted_files = 0;
    304     int64 undeleted_dirs = 0;
    305     ASSERT_TRUE(env_->DeleteRecursively(dump_roots[i], &undeleted_files,
    306                                         &undeleted_dirs)
    307                     .ok());
    308     ASSERT_EQ(0, undeleted_files);
    309     ASSERT_EQ(0, undeleted_dirs);
    310   }
    311 }
    312 
    313 TEST_F(DebugIOUtilsTest, PublishTensorToMemoryCallback) {
    314   Initialize();
    315 
    316   const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
    317                                    "foo/bar/qux/tensor_a", 0, "DebugIdentity");
    318   const uint64 wall_time = env_->NowMicros();
    319 
    320   bool called = false;
    321   std::vector<string> urls = {"memcbk://test_callback"};
    322   ;
    323 
    324   auto* callback_registry = DebugCallbackRegistry::singleton();
    325   callback_registry->RegisterCallback(
    326       "test_callback", [this, &kDebugNodeKey, &called](const DebugNodeKey& key,
    327                                                        const Tensor& tensor) {
    328         called = true;
    329         ASSERT_EQ(kDebugNodeKey.device_name, key.device_name);
    330         ASSERT_EQ(kDebugNodeKey.node_name, key.node_name);
    331         ASSERT_EQ(tensor_a_->shape(), tensor.shape());
    332         for (int i = 0; i < tensor.flat<float>().size(); ++i) {
    333           ASSERT_EQ(tensor_a_->flat<float>()(i), tensor.flat<float>()(i));
    334         }
    335       });
    336 
    337   Status s =
    338       DebugIO::PublishDebugTensor(kDebugNodeKey, *tensor_a_, wall_time, urls);
    339   ASSERT_TRUE(s.ok());
    340   ASSERT_TRUE(called);
    341 
    342   callback_registry->UnregisterCallback("test_callback");
    343 }
    344 
    345 TEST_F(DebugIOUtilsTest, PublishTensorConcurrentlyToPartiallyOverlappingPaths) {
    346   Initialize();
    347 
    348   const int kConcurrentPubs = 3;
    349   const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
    350                                    "tensor_a", 0, "DebugIdentity");
    351 
    352   thread::ThreadPool* tp =
    353       new thread::ThreadPool(Env::Default(), "test", kConcurrentPubs);
    354   const uint64 wall_time = env_->NowMicros();
    355   const string dump_root_base = testing::TmpDir();
    356 
    357   mutex mu;
    358   std::vector<string> dump_roots GUARDED_BY(mu);
    359   std::vector<string> dump_file_paths GUARDED_BY(mu);
    360 
    361   int dump_count GUARDED_BY(mu) = 0;
    362   int done_count GUARDED_BY(mu) = 0;
    363   Notification all_done;
    364 
    365   auto fn = [this, &dump_count, &done_count, &mu, &dump_root_base, &dump_roots,
    366              &dump_file_paths, &wall_time, &kDebugNodeKey, &kConcurrentPubs,
    367              &all_done]() {
    368     // "gumpy" is the shared directory part of the path.
    369     string dump_root;
    370     string debug_url;
    371     {
    372       mutex_lock l(mu);
    373       dump_root =
    374           strings::StrCat(dump_root_base, "grumpy/", "dump_", dump_count++);
    375 
    376       dump_roots.push_back(dump_root);
    377       dump_file_paths.push_back(
    378           DebugFileIO::GetDumpFilePath(dump_root, kDebugNodeKey, wall_time));
    379 
    380       debug_url = strings::StrCat("file://", dump_root);
    381     }
    382 
    383     std::vector<string> urls;
    384     urls.push_back(debug_url);
    385 
    386     Status s =
    387         DebugIO::PublishDebugTensor(kDebugNodeKey, *tensor_a_, wall_time, urls);
    388     ASSERT_TRUE(s.ok());
    389 
    390     {
    391       mutex_lock l(mu);
    392 
    393       done_count++;
    394       if (done_count == kConcurrentPubs) {
    395         all_done.Notify();
    396       }
    397     }
    398   };
    399 
    400   for (int i = 0; i < kConcurrentPubs; ++i) {
    401     tp->Schedule(fn);
    402   }
    403 
    404   // Wait for all dumping calls to finish.
    405   all_done.WaitForNotification();
    406   delete tp;
    407 
    408   {
    409     mutex_lock l(mu);
    410 
    411     for (int i = 1; i < kConcurrentPubs; ++i) {
    412       ASSERT_NE(dump_roots[0], dump_roots[i]);
    413     }
    414 
    415     // Try reading the file into a Event proto.
    416     for (int i = 0; i < kConcurrentPubs; ++i) {
    417       // Read the file into a Event proto.
    418       Event event;
    419       TF_ASSERT_OK(ReadEventFromFile(dump_file_paths[i], &event));
    420 
    421       ASSERT_GE(wall_time, event.wall_time());
    422       ASSERT_EQ(1, event.summary().value().size());
    423       ASSERT_EQ(kDebugNodeKey.node_name, event.summary().value(0).tag());
    424       ASSERT_EQ(kDebugNodeKey.debug_node_name,
    425                 event.summary().value(0).node_name());
    426 
    427       // Determine and validate some information from the metadata.
    428       third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
    429       auto status = tensorflow::protobuf::util::JsonStringToMessage(
    430           event.summary().value(0).metadata().plugin_data().content(),
    431           &metadata);
    432       ASSERT_TRUE(status.ok());
    433       ASSERT_EQ(kDebugNodeKey.device_name, metadata.device());
    434       ASSERT_EQ(kDebugNodeKey.output_slot, metadata.output_slot());
    435 
    436       Tensor a_prime(DT_FLOAT);
    437       ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor()));
    438 
    439       // Verify tensor shape and value.
    440       ASSERT_EQ(tensor_a_->shape(), a_prime.shape());
    441       for (int i = 0; i < a_prime.flat<float>().size(); ++i) {
    442         ASSERT_EQ(tensor_a_->flat<float>()(i), a_prime.flat<float>()(i));
    443       }
    444     }
    445 
    446     // Tear down temporary file and directories.
    447     int64 undeleted_files = 0;
    448     int64 undeleted_dirs = 0;
    449     ASSERT_TRUE(env_->DeleteRecursively(dump_root_base, &undeleted_files,
    450                                         &undeleted_dirs)
    451                     .ok());
    452     ASSERT_EQ(0, undeleted_files);
    453     ASSERT_EQ(0, undeleted_dirs);
    454   }
    455 }
    456 
    457 }  // namespace
    458 }  // namespace tensorflow
    459