1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include <memory> 17 #include <vector> 18 19 #include "tensorflow/compiler/xla/array2d.h" 20 #include "tensorflow/compiler/xla/client/computation_builder.h" 21 #include "tensorflow/compiler/xla/client/local_client.h" 22 #include "tensorflow/compiler/xla/map_util.h" 23 #include "tensorflow/compiler/xla/service/platform_util.h" 24 #include "tensorflow/compiler/xla/shape_util.h" 25 #include "tensorflow/compiler/xla/tests/client_library_test_base.h" 26 #include "tensorflow/compiler/xla/tests/test_macros.h" 27 #include "tensorflow/compiler/xla/tests/test_utils.h" 28 #include "tensorflow/core/lib/core/status_test_util.h" 29 #include "tensorflow/core/lib/gtl/flatmap.h" 30 #include "tensorflow/core/platform/regexp.h" 31 #include "tensorflow/core/platform/test.h" 32 #include "tensorflow/core/platform/types.h" 33 34 namespace xla { 35 namespace { 36 namespace se = ::perftools::gputools; 37 namespace gtl = ::tensorflow::gtl; 38 39 class HloProfileTest : public ClientLibraryTestBase {}; 40 41 struct ParsedProfileOutputLine { 42 int64 cycles; 43 string cycles_percentage; 44 double usec; 45 string flops; 46 string trops; 47 string bytes_per_sec; 48 string bytes_per_cycle; 49 string opcode; 50 }; 51 52 ::testing::AssertionResult HasFlops( 53 const ParsedProfileOutputLine& parsed_line) { 54 if (RE2::FullMatch(parsed_line.flops, "[0-9.TGMk]+FLOP/s")) { 55 return ::testing::AssertionSuccess() 56 << "'flops' field present in " << parsed_line.opcode << ": '" 57 << parsed_line.flops << "'"; 58 } 59 60 return ::testing::AssertionFailure() 61 << "'flops' field absent in " << parsed_line.opcode << ": '" 62 << parsed_line.flops << "'"; 63 } 64 65 ::testing::AssertionResult HasTrops( 66 const ParsedProfileOutputLine& parsed_line) { 67 if (RE2::FullMatch(parsed_line.trops, "[0-9.TGMk]+TROP/s")) { 68 return ::testing::AssertionSuccess() 69 << "'trops' field present in " << parsed_line.opcode << ": '" 70 << parsed_line.trops << "'"; 71 } 72 73 return ::testing::AssertionFailure() 74 << "'trops' field absent in " << parsed_line.opcode << ": '" 75 << parsed_line.trops << "'"; 76 } 77 78 Status ParseOneProfileOutputLine( 79 const string& line, bool expect_hlo, 80 gtl::FlatMap<string, ParsedProfileOutputLine>* parsed_results) { 81 string separator = "[^:]*:: +"; 82 string match_percentage = "\\d+\\.\\d\\d%"; 83 string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)"; 84 string match_usecs = "([0-9.]+) usec"; 85 string match_flops = "([^ ]+)"; 86 string match_trops = "([^ ]+)"; 87 string match_bytes_per_sec = "([0-9.TGMKi]+)B/s"; 88 string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle"; 89 90 // The underlined part is what we're trying to match with match_opcode: 91 // 92 // %dot33 = f32[256,256]{1,0} dot(...) 93 // ^^^ 94 95 string match_opcode = 96 expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*" : "(\\[total\\])"; 97 string regexp_pattern = tensorflow::strings::StrCat( 98 " +", match_cycles, separator, match_usecs, separator, match_flops, 99 separator, match_trops, separator, match_bytes_per_sec, separator, 100 match_bytes_per_cycle, separator, match_opcode); 101 102 ParsedProfileOutputLine parsed_line; 103 bool matched = RE2::FullMatch( 104 line, regexp_pattern, &parsed_line.cycles, &parsed_line.cycles_percentage, 105 &parsed_line.usec, &parsed_line.flops, &parsed_line.trops, 106 &parsed_line.bytes_per_sec, &parsed_line.bytes_per_cycle, 107 &parsed_line.opcode); 108 if (!matched) { 109 return tensorflow::errors::InvalidArgument( 110 "Input did not match regexp. Input: ", line, 111 ", Regexp: ", regexp_pattern); 112 } 113 114 InsertOrDie(parsed_results, parsed_line.opcode, parsed_line); 115 116 return Status::OK(); 117 } 118 119 // Returns void so that we can ASSERT. 120 void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, 121 const Computation& computation, 122 const Shape& lhs_arg_shape, 123 const Shape& rhs_arg_shape) { 124 LocalService* service = ClientLibrary::GetXlaService(client->platform()); 125 Backend* backend = service->mutable_backend(); 126 se::StreamExecutor* executor = backend->default_stream_executor(); 127 DeviceMemoryAllocator* allocator = backend->memory_allocator(); 128 auto* transfer_manager = backend->transfer_manager(); 129 130 TF_ASSERT_OK_AND_ASSIGN( 131 std::unique_ptr<ScopedShapedBuffer> lhs_arg, 132 transfer_manager->AllocateScopedShapedBuffer( 133 lhs_arg_shape, allocator, backend->default_device_ordinal())); 134 TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( 135 executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg)); 136 137 TF_ASSERT_OK_AND_ASSIGN( 138 std::unique_ptr<ScopedShapedBuffer> rhs_arg, 139 transfer_manager->AllocateScopedShapedBuffer( 140 rhs_arg_shape, allocator, backend->default_device_ordinal())); 141 TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( 142 executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg)); 143 144 TF_ASSERT_OK_AND_ASSIGN( 145 std::unique_ptr<LocalExecutable> local_executable, 146 client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape}, 147 ExecutableBuildOptions())); 148 149 Executable* executable = local_executable->executable(); 150 HloExecutionProfile hlo_execution_profile( 151 &executable->hlo_profile_printer_data(), 152 &executable->hlo_profile_index_map()); 153 154 TF_ASSERT_OK_AND_ASSIGN( 155 Backend::StreamPtr stream_ptr, 156 backend->BorrowStream(backend->default_device_ordinal())); 157 ExecutableRunOptions exec_run_options; 158 exec_run_options.set_stream(stream_ptr.get()); 159 exec_run_options.set_allocator(backend->memory_allocator()); 160 exec_run_options.set_intra_op_thread_pool( 161 backend->eigen_intra_op_thread_pool_device()); 162 ServiceExecutableRunOptions run_options( 163 exec_run_options, /*borrow_stream=*/nullptr, 164 backend->eigen_intra_op_thread_pool()); 165 TF_ASSERT_OK_AND_ASSIGN( 166 auto execution_result, 167 executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()}, 168 &hlo_execution_profile)); 169 (void)execution_result; 170 171 *profile_output = 172 hlo_execution_profile.ToString(executor->GetDeviceDescription()); 173 174 XLA_VLOG_LINES(4, *profile_output); 175 } 176 177 // TODO(b/71364943): This test exposes a bug in the parallel CPU backend. 178 XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) { 179 const int64 m = 256, k = 256, n = 256; 180 Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k}); 181 Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k}); 182 183 TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform, 184 PlatformUtil::GetDefaultPlatform()); 185 TF_ASSERT_OK_AND_ASSIGN(LocalClient * client, 186 ClientLibrary::GetOrCreateLocalClient(platform)); 187 188 ComputationBuilder builder(client, TestName()); 189 auto result = builder.Tanh(builder.Add( 190 builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"), 191 builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs"))); 192 193 TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); 194 195 string profile_output; 196 ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape, 197 rhs_shape); 198 199 std::vector<string> profile_output_lines = 200 tensorflow::str_util::Split(profile_output, '\n'); 201 202 gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines; 203 204 TF_ASSERT_OK(ParseOneProfileOutputLine( 205 profile_output_lines[1], /*expect_hlo=*/false, &parsed_profile_lines)); 206 207 TF_ASSERT_OK(ParseOneProfileOutputLine( 208 profile_output_lines[2], /*expect_hlo=*/true, &parsed_profile_lines)); 209 210 TF_ASSERT_OK(ParseOneProfileOutputLine( 211 profile_output_lines[3], /*expect_hlo=*/true, &parsed_profile_lines)); 212 213 TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_profile, 214 MaybeFind(parsed_profile_lines, "[total]")); 215 TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile, 216 MaybeFind(parsed_profile_lines, "add")); 217 TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine tanh_profile, 218 MaybeFind(parsed_profile_lines, "tanh")); 219 220 EXPECT_GT(total_profile.cycles, 0); 221 EXPECT_EQ(total_profile.cycles_percentage, "100.00%"); 222 223 EXPECT_TRUE(HasFlops(total_profile)); 224 EXPECT_TRUE(HasTrops(total_profile)); 225 226 EXPECT_GT(total_profile.cycles, dot_profile.cycles); 227 EXPECT_NE(dot_profile.cycles_percentage, "0.00%"); 228 EXPECT_NE(dot_profile.cycles_percentage, "100.00%"); 229 230 EXPECT_TRUE(HasFlops(dot_profile)); 231 EXPECT_FALSE(HasTrops(dot_profile)); 232 233 EXPECT_GT(total_profile.cycles, tanh_profile.cycles); 234 EXPECT_NE(tanh_profile.cycles_percentage, "0.00%"); 235 EXPECT_NE(tanh_profile.cycles_percentage, "100.00%"); 236 237 EXPECT_FALSE(HasFlops(tanh_profile)); 238 EXPECT_TRUE(HasTrops(tanh_profile)); 239 } 240 241 // TODO(b/71364943): This test exposes a bug in the parallel CPU backend. 242 // 243 // TODO(b/71544591): The GPU backend does not record cycles spent in on Hlo 244 // instructions "interior" to while nodes. 245 XLA_TEST_F(HloProfileTest, 246 DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(ProfileWhileComputation))) { 247 const int64 size = 256; 248 Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size}); 249 Shape while_result_shape = 250 ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {}), matrix_shape}); 251 252 TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform, 253 PlatformUtil::GetDefaultPlatform()); 254 TF_ASSERT_OK_AND_ASSIGN(LocalClient * client, 255 ClientLibrary::GetOrCreateLocalClient(platform)); 256 257 Computation condition; 258 { 259 ComputationBuilder builder(client, "condition"); 260 auto state = builder.Parameter(0, while_result_shape, "state"); 261 auto iteration = builder.GetTupleElement(state, 0); 262 builder.Gt(builder.ConstantR0<int32>(5), iteration); 263 TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); 264 } 265 266 Computation body; 267 { 268 ComputationBuilder builder(client, "body"); 269 auto state = builder.Parameter(0, while_result_shape, "state"); 270 auto matrix = builder.GetTupleElement(state, 1); 271 auto next_iteration = builder.Add(builder.GetTupleElement(state, 0), 272 builder.ConstantR0<int32>(1)); 273 builder.Tuple({next_iteration, builder.Add(matrix, matrix)}); 274 TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); 275 } 276 277 ComputationBuilder builder(client, TestName()); 278 auto initial_while_state = 279 builder.Tuple({builder.ConstantR0<int32>(0), 280 builder.Parameter(0, matrix_shape, "initial_value")}); 281 auto while_result = builder.While(condition, body, initial_while_state); 282 builder.Add(builder.GetTupleElement(while_result, 1), 283 builder.Parameter(1, matrix_shape, "other_value")); 284 285 TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); 286 287 string profile_output; 288 ExecuteAndFetchProfile(&profile_output, client, computation, matrix_shape, 289 matrix_shape); 290 291 std::vector<string> profile_output_lines = 292 tensorflow::str_util::Split(profile_output, '\n'); 293 294 auto while_body_profile_start = 295 std::find_if(profile_output_lines.begin(), profile_output_lines.end(), 296 [](tensorflow::StringPiece s) { 297 return s.starts_with("Execution profile for body"); 298 }); 299 300 ASSERT_NE(while_body_profile_start, profile_output_lines.end()); 301 302 gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines; 303 304 TF_ASSERT_OK( 305 ParseOneProfileOutputLine(*std::next(while_body_profile_start, 1), 306 /*expect_hlo=*/false, &parsed_profile_lines)); 307 308 TF_ASSERT_OK( 309 ParseOneProfileOutputLine(*std::next(while_body_profile_start, 2), 310 /*expect_hlo=*/true, &parsed_profile_lines)); 311 312 TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_while_body_profile, 313 MaybeFind(parsed_profile_lines, "[total]")); 314 TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile, 315 MaybeFind(parsed_profile_lines, "add")); 316 317 EXPECT_GT(total_while_body_profile.cycles, 0); 318 EXPECT_EQ(total_while_body_profile.opcode, "[total]"); 319 EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.00%"); 320 321 EXPECT_GT(total_while_body_profile.cycles, dot_profile.cycles); 322 EXPECT_NE(dot_profile.cycles_percentage, "0.00%"); 323 EXPECT_NE(dot_profile.cycles_percentage, "100.00%"); 324 } 325 } // namespace 326 } // namespace xla 327 328 static std::pair<int, char**> AddXlaHloProfileFlag(int argc, char** argv) { 329 // Intentional "leak". 330 char** new_argv = new char*[argc + 2]; 331 for (int i = 0; i < argc; i++) { 332 new_argv[i] = argv[i]; 333 } 334 335 // We do it this way (as opposed to piping in a modified DebugOptions 336 // instance) for better end-to-end integration testing. 337 new_argv[argc] = strdup("--xla_hlo_profile"); 338 339 // Fusion can change the Hlo instructions that show up in the final Hlo 340 // executable, so block it here. 341 new_argv[argc + 1] = strdup("--xla_disable_hlo_passes=fusion"); 342 return {argc + 2, new_argv}; 343 } 344 345 GTEST_API_ int main(int argc, char** argv) { 346 std::vector<tensorflow::Flag> flag_list; 347 xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); 348 std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv); 349 350 auto usage = tensorflow::Flags::Usage(argv[0], flag_list); 351 if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) { 352 LOG(ERROR) << "\n" << usage; 353 return 2; 354 } 355 356 testing::InitGoogleTest(&argc, argv); 357 if (argc > 1) { 358 LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage; 359 return 2; 360 } 361 return RUN_ALL_TESTS(); 362 } 363