Home | History | Annotate | Download | only in tests
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include <memory>
     17 #include <vector>
     18 
     19 #include "tensorflow/compiler/xla/array2d.h"
     20 #include "tensorflow/compiler/xla/client/computation_builder.h"
     21 #include "tensorflow/compiler/xla/client/local_client.h"
     22 #include "tensorflow/compiler/xla/map_util.h"
     23 #include "tensorflow/compiler/xla/service/platform_util.h"
     24 #include "tensorflow/compiler/xla/shape_util.h"
     25 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
     26 #include "tensorflow/compiler/xla/tests/test_macros.h"
     27 #include "tensorflow/compiler/xla/tests/test_utils.h"
     28 #include "tensorflow/core/lib/core/status_test_util.h"
     29 #include "tensorflow/core/lib/gtl/flatmap.h"
     30 #include "tensorflow/core/platform/regexp.h"
     31 #include "tensorflow/core/platform/test.h"
     32 #include "tensorflow/core/platform/types.h"
     33 
     34 namespace xla {
     35 namespace {
     36 namespace se = ::perftools::gputools;
     37 namespace gtl = ::tensorflow::gtl;
     38 
     39 class HloProfileTest : public ClientLibraryTestBase {};
     40 
     41 struct ParsedProfileOutputLine {
     42   int64 cycles;
     43   string cycles_percentage;
     44   double usec;
     45   string flops;
     46   string trops;
     47   string bytes_per_sec;
     48   string bytes_per_cycle;
     49   string opcode;
     50 };
     51 
     52 ::testing::AssertionResult HasFlops(
     53     const ParsedProfileOutputLine& parsed_line) {
     54   if (RE2::FullMatch(parsed_line.flops, "[0-9.TGMk]+FLOP/s")) {
     55     return ::testing::AssertionSuccess()
     56            << "'flops' field present in  " << parsed_line.opcode << ": '"
     57            << parsed_line.flops << "'";
     58   }
     59 
     60   return ::testing::AssertionFailure()
     61          << "'flops' field absent in  " << parsed_line.opcode << ": '"
     62          << parsed_line.flops << "'";
     63 }
     64 
     65 ::testing::AssertionResult HasTrops(
     66     const ParsedProfileOutputLine& parsed_line) {
     67   if (RE2::FullMatch(parsed_line.trops, "[0-9.TGMk]+TROP/s")) {
     68     return ::testing::AssertionSuccess()
     69            << "'trops' field present in  " << parsed_line.opcode << ": '"
     70            << parsed_line.trops << "'";
     71   }
     72 
     73   return ::testing::AssertionFailure()
     74          << "'trops' field absent in  " << parsed_line.opcode << ": '"
     75          << parsed_line.trops << "'";
     76 }
     77 
     78 Status ParseOneProfileOutputLine(
     79     const string& line, bool expect_hlo,
     80     gtl::FlatMap<string, ParsedProfileOutputLine>* parsed_results) {
     81   string separator = "[^:]*:: +";
     82   string match_percentage = "\\d+\\.\\d\\d%";
     83   string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)";
     84   string match_usecs = "([0-9.]+) usec";
     85   string match_flops = "([^ ]+)";
     86   string match_trops = "([^ ]+)";
     87   string match_bytes_per_sec = "([0-9.TGMKi]+)B/s";
     88   string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle";
     89 
     90   // The underlined part is what we're trying to match with match_opcode:
     91   //
     92   //   %dot33 = f32[256,256]{1,0} dot(...)
     93   //                              ^^^
     94 
     95   string match_opcode =
     96       expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*" : "(\\[total\\])";
     97   string regexp_pattern = tensorflow::strings::StrCat(
     98       " +", match_cycles, separator, match_usecs, separator, match_flops,
     99       separator, match_trops, separator, match_bytes_per_sec, separator,
    100       match_bytes_per_cycle, separator, match_opcode);
    101 
    102   ParsedProfileOutputLine parsed_line;
    103   bool matched = RE2::FullMatch(
    104       line, regexp_pattern, &parsed_line.cycles, &parsed_line.cycles_percentage,
    105       &parsed_line.usec, &parsed_line.flops, &parsed_line.trops,
    106       &parsed_line.bytes_per_sec, &parsed_line.bytes_per_cycle,
    107       &parsed_line.opcode);
    108   if (!matched) {
    109     return tensorflow::errors::InvalidArgument(
    110         "Input did not match regexp.  Input: ", line,
    111         ", Regexp: ", regexp_pattern);
    112   }
    113 
    114   InsertOrDie(parsed_results, parsed_line.opcode, parsed_line);
    115 
    116   return Status::OK();
    117 }
    118 
    119 // Returns void so that we can ASSERT.
    120 void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
    121                             const Computation& computation,
    122                             const Shape& lhs_arg_shape,
    123                             const Shape& rhs_arg_shape) {
    124   LocalService* service = ClientLibrary::GetXlaService(client->platform());
    125   Backend* backend = service->mutable_backend();
    126   se::StreamExecutor* executor = backend->default_stream_executor();
    127   DeviceMemoryAllocator* allocator = backend->memory_allocator();
    128   auto* transfer_manager = backend->transfer_manager();
    129 
    130   TF_ASSERT_OK_AND_ASSIGN(
    131       std::unique_ptr<ScopedShapedBuffer> lhs_arg,
    132       transfer_manager->AllocateScopedShapedBuffer(
    133           lhs_arg_shape, allocator, backend->default_device_ordinal()));
    134   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
    135       executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg));
    136 
    137   TF_ASSERT_OK_AND_ASSIGN(
    138       std::unique_ptr<ScopedShapedBuffer> rhs_arg,
    139       transfer_manager->AllocateScopedShapedBuffer(
    140           rhs_arg_shape, allocator, backend->default_device_ordinal()));
    141   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
    142       executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg));
    143 
    144   TF_ASSERT_OK_AND_ASSIGN(
    145       std::unique_ptr<LocalExecutable> local_executable,
    146       client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape},
    147                       ExecutableBuildOptions()));
    148 
    149   Executable* executable = local_executable->executable();
    150   HloExecutionProfile hlo_execution_profile(
    151       &executable->hlo_profile_printer_data(),
    152       &executable->hlo_profile_index_map());
    153 
    154   TF_ASSERT_OK_AND_ASSIGN(
    155       Backend::StreamPtr stream_ptr,
    156       backend->BorrowStream(backend->default_device_ordinal()));
    157   ExecutableRunOptions exec_run_options;
    158   exec_run_options.set_stream(stream_ptr.get());
    159   exec_run_options.set_allocator(backend->memory_allocator());
    160   exec_run_options.set_intra_op_thread_pool(
    161       backend->eigen_intra_op_thread_pool_device());
    162   ServiceExecutableRunOptions run_options(
    163       exec_run_options, /*borrow_stream=*/nullptr,
    164       backend->eigen_intra_op_thread_pool());
    165   TF_ASSERT_OK_AND_ASSIGN(
    166       auto execution_result,
    167       executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()},
    168                                   &hlo_execution_profile));
    169   (void)execution_result;
    170 
    171   *profile_output =
    172       hlo_execution_profile.ToString(executor->GetDeviceDescription());
    173 
    174   XLA_VLOG_LINES(4, *profile_output);
    175 }
    176 
    177 // TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
    178 XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
    179   const int64 m = 256, k = 256, n = 256;
    180   Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
    181   Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k});
    182 
    183   TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform,
    184                           PlatformUtil::GetDefaultPlatform());
    185   TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
    186                           ClientLibrary::GetOrCreateLocalClient(platform));
    187 
    188   ComputationBuilder builder(client, TestName());
    189   auto result = builder.Tanh(builder.Add(
    190       builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"),
    191       builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs")));
    192 
    193   TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
    194 
    195   string profile_output;
    196   ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape,
    197                          rhs_shape);
    198 
    199   std::vector<string> profile_output_lines =
    200       tensorflow::str_util::Split(profile_output, '\n');
    201 
    202   gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
    203 
    204   TF_ASSERT_OK(ParseOneProfileOutputLine(
    205       profile_output_lines[1], /*expect_hlo=*/false, &parsed_profile_lines));
    206 
    207   TF_ASSERT_OK(ParseOneProfileOutputLine(
    208       profile_output_lines[2], /*expect_hlo=*/true, &parsed_profile_lines));
    209 
    210   TF_ASSERT_OK(ParseOneProfileOutputLine(
    211       profile_output_lines[3], /*expect_hlo=*/true, &parsed_profile_lines));
    212 
    213   TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_profile,
    214                           MaybeFind(parsed_profile_lines, "[total]"));
    215   TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile,
    216                           MaybeFind(parsed_profile_lines, "add"));
    217   TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine tanh_profile,
    218                           MaybeFind(parsed_profile_lines, "tanh"));
    219 
    220   EXPECT_GT(total_profile.cycles, 0);
    221   EXPECT_EQ(total_profile.cycles_percentage, "100.00%");
    222 
    223   EXPECT_TRUE(HasFlops(total_profile));
    224   EXPECT_TRUE(HasTrops(total_profile));
    225 
    226   EXPECT_GT(total_profile.cycles, dot_profile.cycles);
    227   EXPECT_NE(dot_profile.cycles_percentage, "0.00%");
    228   EXPECT_NE(dot_profile.cycles_percentage, "100.00%");
    229 
    230   EXPECT_TRUE(HasFlops(dot_profile));
    231   EXPECT_FALSE(HasTrops(dot_profile));
    232 
    233   EXPECT_GT(total_profile.cycles, tanh_profile.cycles);
    234   EXPECT_NE(tanh_profile.cycles_percentage, "0.00%");
    235   EXPECT_NE(tanh_profile.cycles_percentage, "100.00%");
    236 
    237   EXPECT_FALSE(HasFlops(tanh_profile));
    238   EXPECT_TRUE(HasTrops(tanh_profile));
    239 }
    240 
    241 // TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
    242 //
    243 // TODO(b/71544591): The GPU backend does not record cycles spent in on Hlo
    244 // instructions "interior" to while nodes.
    245 XLA_TEST_F(HloProfileTest,
    246            DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(ProfileWhileComputation))) {
    247   const int64 size = 256;
    248   Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size});
    249   Shape while_result_shape =
    250       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {}), matrix_shape});
    251 
    252   TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform,
    253                           PlatformUtil::GetDefaultPlatform());
    254   TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
    255                           ClientLibrary::GetOrCreateLocalClient(platform));
    256 
    257   Computation condition;
    258   {
    259     ComputationBuilder builder(client, "condition");
    260     auto state = builder.Parameter(0, while_result_shape, "state");
    261     auto iteration = builder.GetTupleElement(state, 0);
    262     builder.Gt(builder.ConstantR0<int32>(5), iteration);
    263     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
    264   }
    265 
    266   Computation body;
    267   {
    268     ComputationBuilder builder(client, "body");
    269     auto state = builder.Parameter(0, while_result_shape, "state");
    270     auto matrix = builder.GetTupleElement(state, 1);
    271     auto next_iteration = builder.Add(builder.GetTupleElement(state, 0),
    272                                       builder.ConstantR0<int32>(1));
    273     builder.Tuple({next_iteration, builder.Add(matrix, matrix)});
    274     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
    275   }
    276 
    277   ComputationBuilder builder(client, TestName());
    278   auto initial_while_state =
    279       builder.Tuple({builder.ConstantR0<int32>(0),
    280                      builder.Parameter(0, matrix_shape, "initial_value")});
    281   auto while_result = builder.While(condition, body, initial_while_state);
    282   builder.Add(builder.GetTupleElement(while_result, 1),
    283               builder.Parameter(1, matrix_shape, "other_value"));
    284 
    285   TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
    286 
    287   string profile_output;
    288   ExecuteAndFetchProfile(&profile_output, client, computation, matrix_shape,
    289                          matrix_shape);
    290 
    291   std::vector<string> profile_output_lines =
    292       tensorflow::str_util::Split(profile_output, '\n');
    293 
    294   auto while_body_profile_start =
    295       std::find_if(profile_output_lines.begin(), profile_output_lines.end(),
    296                    [](tensorflow::StringPiece s) {
    297                      return s.starts_with("Execution profile for body");
    298                    });
    299 
    300   ASSERT_NE(while_body_profile_start, profile_output_lines.end());
    301 
    302   gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
    303 
    304   TF_ASSERT_OK(
    305       ParseOneProfileOutputLine(*std::next(while_body_profile_start, 1),
    306                                 /*expect_hlo=*/false, &parsed_profile_lines));
    307 
    308   TF_ASSERT_OK(
    309       ParseOneProfileOutputLine(*std::next(while_body_profile_start, 2),
    310                                 /*expect_hlo=*/true, &parsed_profile_lines));
    311 
    312   TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_while_body_profile,
    313                           MaybeFind(parsed_profile_lines, "[total]"));
    314   TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile,
    315                           MaybeFind(parsed_profile_lines, "add"));
    316 
    317   EXPECT_GT(total_while_body_profile.cycles, 0);
    318   EXPECT_EQ(total_while_body_profile.opcode, "[total]");
    319   EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.00%");
    320 
    321   EXPECT_GT(total_while_body_profile.cycles, dot_profile.cycles);
    322   EXPECT_NE(dot_profile.cycles_percentage, "0.00%");
    323   EXPECT_NE(dot_profile.cycles_percentage, "100.00%");
    324 }
    325 }  // namespace
    326 }  // namespace xla
    327 
    328 static std::pair<int, char**> AddXlaHloProfileFlag(int argc, char** argv) {
    329   // Intentional "leak".
    330   char** new_argv = new char*[argc + 2];
    331   for (int i = 0; i < argc; i++) {
    332     new_argv[i] = argv[i];
    333   }
    334 
    335   // We do it this way (as opposed to piping in a modified DebugOptions
    336   // instance) for better end-to-end integration testing.
    337   new_argv[argc] = strdup("--xla_hlo_profile");
    338 
    339   // Fusion can change the Hlo instructions that show up in the final Hlo
    340   // executable, so block it here.
    341   new_argv[argc + 1] = strdup("--xla_disable_hlo_passes=fusion");
    342   return {argc + 2, new_argv};
    343 }
    344 
    345 GTEST_API_ int main(int argc, char** argv) {
    346   std::vector<tensorflow::Flag> flag_list;
    347   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
    348   std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv);
    349 
    350   auto usage = tensorflow::Flags::Usage(argv[0], flag_list);
    351   if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {
    352     LOG(ERROR) << "\n" << usage;
    353     return 2;
    354   }
    355 
    356   testing::InitGoogleTest(&argc, argv);
    357   if (argc > 1) {
    358     LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
    359     return 2;
    360   }
    361   return RUN_ALL_TESTS();
    362 }
    363