/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h"

#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
#include "tensorflow/core/kernels/hexagon/soc_interface.h"
#include "tensorflow/core/platform/profile_utils/cpu_utils.h"

namespace tensorflow {

constexpr const char* const OUTPUT_OP_NAME = "OUTPUT";
constexpr const char* const REMOTE_FUSED_GRAPH_NODE_NAME_PREFIX =
    "hexagon_remote_fused_graph";
/* static */ constexpr const char* const
    HexagonControlWrapper::REMOTE_FUSED_GRAPH_EXECUTOR_NAME;

constexpr int ALIGNMENT_BYTES = 16;
constexpr int MAX_IN_OUT_COUNT = 128;

const bool DBG_DUMP_VERIFICATION_STRING = false;
const int DBG_LEVEL = 0;  // -2: verbose, -1: debug, 0: info
const bool DBG_USE_DUMMY_INPUT = false;
const bool DBG_USE_SAMPLE_INPUT = false;
const int64 FLAG_ENABLE_PANDA_BINARY_INPUT = 0x01;
const bool DBG_DUMP_INPUT_TENSOR_AS_FLOAT_DATA = false;

static string AddPort(const string& node_name) {
  if (node_name.find(':') != string::npos) {
    return node_name;
  } else {
    return strings::StrCat(node_name, ":", 0);
  }
}

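// Rounds `ptr` up to the next ALIGNMENT_BYTES (16-byte) boundary; the outer
// "% ALIGNMENT_BYTES" keeps an already-aligned pointer unshifted. Worked
// example with ALIGNMENT_BYTES == 16:
//   ptr % 16 == 5  ->  shift_count == (16 - 5) % 16 == 11
//   ptr % 16 == 0  ->  shift_count == (16 - 0) % 16 == 0
// Callers therefore over-allocate their buffers by at least
// ALIGNMENT_BYTES - 1 bytes so the shifted pointer still has room for the
// payload.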
static uint8* FindAlignedPointer(uint8* ptr) {
  const uintptr_t data_ptr_int = reinterpret_cast<uintptr_t>(ptr);
  const int shift_count =
      (ALIGNMENT_BYTES - data_ptr_int % ALIGNMENT_BYTES) % ALIGNMENT_BYTES;
  uint8* data_ptr = ptr + shift_count;
  return data_ptr;
}

/* static */ GraphTransferInfo::NodeInfo* HexagonControlWrapper::FindNodeInfo(
    const string& name, GraphTransferInfo* graph_transfer_info) {
  for (GraphTransferInfo::NodeInfo& node_info :
       *graph_transfer_info->mutable_node_info()) {
    if (node_info.name() == name) {
      return &node_info;
    }
  }
  return nullptr;
}

int HexagonControlWrapper::GetVersion() {
  return soc_interface_GetSocControllerVersion();
}

bool HexagonControlWrapper::Init(const RemoteFusedGraphExecuteInfo& info) {
  soc_interface_SetLogLevel(DBG_LEVEL);
  if (DBG_USE_SAMPLE_INPUT) {
    soc_interface_SetDebugFlag(FLAG_ENABLE_PANDA_BINARY_INPUT);
  }
  if (info.serialized_executor_parameters().empty()) {
    std::vector<std::pair<string, Tensor>> inputs;
    std::vector<string> outputs;
    RemoteFusedGraphExecuteUtils::BuildRemoteGraphInputsAndOutputsFromProto(
        info, &inputs, &outputs);
    Status status = graph_transferer_.LoadGraphFromProto(
        HexagonOpsDefinitions::getInstance(), info.remote_graph(), inputs,
        outputs,
        false  // shape_inference_for_unknown_shape
    );
    TF_CHECK_OK(status) << status;
  } else {
    // If graph transfer info is attached, just import it.
    graph_transferer_.SetSerializedGraphTransferInfo(
        info.serialized_executor_parameters());
  }
  execute_info_ = &info;
  bool success = soc_interface_Init();
  if (!success) {
    LOG(ERROR) << "Hexagon initialization failed. See log output.";
    return false;
  }
  std::vector<int> input_sizes;
  std::vector<int> output_sizes;
  CHECK_NOTNULL(execute_info_);
  for (int i = 0; i < execute_info_->graph_input_node_name_size(); ++i) {
    const string& input = execute_info_->graph_input_node_name(i);
    LOG(INFO) << "Add input: " << input << ", " << i;
    CHECK(input_port_map_.emplace(AddPort(input), i).second);
    const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type =
        execute_info_->default_graph_input_tensor_shape(i);
    // Buffer size = element size * product of all dimensions.
    int64 buf_size = DataTypeSize(shape_type.dtype());
    for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) {
      buf_size *= dim.size();
    }
    input_sizes.emplace_back(static_cast<int>(buf_size));
  }
  for (int i = 0; i < execute_info_->graph_output_node_name_size(); ++i) {
    const string& output = execute_info_->graph_output_node_name(i);
    CHECK(output_port_map_.emplace(AddPort(output), i).second);
    const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type =
        execute_info_->default_graph_output_tensor_shape(i);

    int64 buf_size = DataTypeSize(shape_type.dtype());
    for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) {
      buf_size *= dim.size();
    }
    output_sizes.emplace_back(static_cast<int>(buf_size));
  }

  LOG(INFO) << "Allocate inout buffer";
  success &= soc_interface_AllocateInOutNodeBuffers(
      input_sizes.size(), input_sizes.data(), output_sizes.size(),
      output_sizes.data());
  return success;
}

bool HexagonControlWrapper::Finalize() { return soc_interface_Finalize(); }

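// Builds the graph on the DSP side via the soc_interface API. The steps
// below are: rewrite the transfer info (append a single OUTPUT node),
// allocate the node input/output arrays, register per-node inputs and
// outputs, instantiate the graph, append const and op nodes, and finally
// construct the graph.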
bool HexagonControlWrapper::SetupGraph() {
  // Copy the graph transfer info and modify it to adapt to the hexnn library.
  GraphTransferInfo& graph_transfer_info =
      graph_transferer_.GetMutableGraphTransferInfo();

  // Overwrite op type of input nodes for hexagon
  for (const GraphTransferInfo::GraphInputNodeInfo& graph_input :
       graph_transfer_info.graph_input_node_info()) {
    GraphTransferInfo::NodeInfo* node_info =
        FindNodeInfo(graph_input.name(), &graph_transfer_info);
    CHECK_NE(node_info, nullptr);
  }

  // Generate a new output node which is connected to the graph output node.
  // TODO(satok): Support multiple output nodes
  CHECK_EQ(graph_transfer_info.graph_output_node_info_size(), 1);
  for (const GraphTransferInfo::GraphOutputNodeInfo& graph_output :
       graph_transfer_info.graph_output_node_info()) {
    const int new_output_node_id = graph_transfer_info.node_info_size() +
                                   graph_transfer_info.const_node_info_size() +
                                   2 /* offset for ids */;
    // Register a new output node
    GraphTransferInfo::NodeInfo& new_output_node_info =
        *graph_transfer_info.add_node_info();
    new_output_node_info.set_name(OUTPUT_OP_NAME);
    new_output_node_info.set_node_id(new_output_node_id);
    new_output_node_info.set_type_name(OUTPUT_OP_NAME);
    new_output_node_info.set_soc_op_id(
        HexagonOpsDefinitions::getInstance().GetOpIdFor(OUTPUT_OP_NAME, {}));
    new_output_node_info.set_padding_id(0 /* PADDING_NA_ID */);
    new_output_node_info.set_input_count(1);
    new_output_node_info.set_output_count(0);

    const TensorId tid = ParseTensorName(graph_output.name());
    const string node_name = tid.first.ToString();
    const int port = tid.second;
    // Register node input for the new output node
    const GraphTransferInfo::NodeInfo* node_info =
        FindNodeInfo(node_name, &graph_transfer_info);
    CHECK_NE(node_info, nullptr);
    GraphTransferInfo::NodeInputInfo& node_input_info =
        *graph_transfer_info.add_node_input_info();
    node_input_info.set_node_id(new_output_node_id);
    GraphTransferInfo::NodeInput& node_input =
        *node_input_info.add_node_input();
    node_input.set_node_id(node_info->node_id());
    node_input.set_output_port(port);
  }

  if (DBG_DUMP_VERIFICATION_STRING) {
    GraphTransferer gt;
    gt.SetSerializedGraphTransferInfo(graph_transfer_info.SerializeAsString());
    gt.DumpVerificationStringOfNodeTransferParams();
  }

  int inputs_count = 0;
  int outputs_count = 0;
  for (const GraphTransferInfo::NodeInputInfo& input_params :
       graph_transfer_info.node_input_info()) {
    inputs_count += input_params.node_input_size();
  }

  for (const GraphTransferInfo::NodeOutputInfo& output_params :
       graph_transfer_info.node_output_info()) {
    outputs_count += output_params.max_byte_size_size();
  }
  // Allocate memory for node inputs and node outputs
  soc_interface_AllocateNodeInputAndNodeOutputArray(inputs_count,
                                                    outputs_count);

  // Construct node input parameters
  std::unordered_map<int, std::tuple<void*, int>> inputs_map;
  for (const GraphTransferInfo::NodeInputInfo& input_params :
       graph_transfer_info.node_input_info()) {
    const int count = input_params.node_input_size();
    CHECK(count <= MAX_IN_OUT_COUNT);
    int node_ids[MAX_IN_OUT_COUNT];
    int ports[MAX_IN_OUT_COUNT];
    for (int i = 0; i < count; ++i) {
      const GraphTransferInfo::NodeInput& node_input =
          input_params.node_input(i);
      node_ids[i] = node_input.node_id() + NODE_ID_OFFSET;
      ports[i] = node_input.output_port();
    }
    void* inputs_ptr = soc_interface_SetOneNodeInputs(count, node_ids, ports);
    const int node_id = input_params.node_id();
    CHECK(inputs_map.count(node_id) == 0);
    inputs_map.emplace(node_id, std::make_tuple(inputs_ptr, count));
  }

  // Construct node output parameters
  std::unordered_map<int, std::tuple<void*, int>> outputs_map;
  for (const GraphTransferInfo::NodeOutputInfo& output_params :
       graph_transfer_info.node_output_info()) {
    const int count = output_params.max_byte_size_size();
    CHECK(count <= MAX_IN_OUT_COUNT);
    int sizes[MAX_IN_OUT_COUNT];
    for (int i = 0; i < count; ++i) {
      const int size = output_params.max_byte_size(i);
      sizes[i] = size;
    }
    void* outputs_ptr = soc_interface_SetOneNodeOutputs(count, sizes);
    const int node_id = output_params.node_id();
    CHECK(outputs_map.count(node_id) == 0);
    outputs_map.emplace(node_id, std::make_tuple(outputs_ptr, count));
  }

  // Instantiate graph
  soc_interface_InstantiateGraph();

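  // Note: each const tensor below is staged in host memory over-allocated by
  // ALIGNMENT_BYTES - 1 bytes so that FindAlignedPointer can pick a 16-byte
  // aligned copy destination; the aligned pointer, not the vector's base
  // pointer, is what is handed to soc_interface_AppendConstNode, presumably
  // to satisfy the DSP side's buffer alignment requirements.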
  // Initialize graph
  // 1. Setup const nodes
  for (const GraphTransferInfo::ConstNodeInfo& params :
       graph_transfer_info.const_node_info()) {
    const int node_id = params.node_id();
    // TODO(satok): Stop assuming shape size is 4.
    CHECK(params.shape_size() == 4);
    const int64 shape_0 = params.shape(0);
    const int64 shape_1 = params.shape(1);
    const int64 shape_2 = params.shape(2);
    const int64 shape_3 = params.shape(3);
    const int data_size = params.data().length();
    CHECK(dummy_const_data_.count(node_id) == 0);
    auto data = dummy_const_data_.emplace(
        std::piecewise_construct, std::make_tuple(node_id), std::make_tuple());
    CHECK(data.second);
    data.first->second.resize(data_size + ALIGNMENT_BYTES - 1);
    uint8* data_ptr = FindAlignedPointer(data.first->second.data());
    std::memcpy(data_ptr, params.data().data(), data_size);
    soc_interface_AppendConstNode(params.name().c_str(),
                                  node_id + NODE_ID_OFFSET, shape_0, shape_1,
                                  shape_2, shape_3, data_ptr, data_size);
  }

  // 2. Setup op nodes
  for (const GraphTransferInfo::NodeInfo& params :
       graph_transfer_info.node_info()) {
    const int node_id = params.node_id();
    const int op_id = params.soc_op_id();
    CHECK(inputs_map.count(node_id) == 1);
    CHECK(outputs_map.count(node_id) <= 1);
    // Only the output node has no output.
    const bool has_output = outputs_map.count(node_id) == 1;
    const auto& input_ptr_and_count = inputs_map.at(node_id);
    const void* input_ptr = std::get<0>(input_ptr_and_count);
    const int input_count = std::get<1>(input_ptr_and_count);
    void* output_ptr = nullptr;
    int output_count = 0;
    if (has_output) {
      const auto& output_ptr_and_count = outputs_map.at(node_id);
      output_ptr = std::get<0>(output_ptr_and_count);
      output_count = std::get<1>(output_ptr_and_count);
      // CHECK(output_count > 0);
    }
    // Map TensorFlow's padding to the ids used on the DSP side:
    // 0 (NA), 1 (SAME), 2 (VALID).
    int padding_id = -1;
    if (params.padding_id() == 0) {
      padding_id = 0;
    } else if (params.padding_id() == Padding::SAME) {
      padding_id = 1;
    } else if (params.padding_id() == Padding::VALID) {
      padding_id = 2;
    } else {
      LOG(FATAL) << "Unsupported padding id: " << params.padding_id();
    }
    soc_interface_AppendNode(params.name().c_str(), node_id + NODE_ID_OFFSET,
                             op_id, padding_id, input_ptr, input_count,
                             output_ptr, output_count);
  }

  LOG(INFO) << "Setup graph completed";

  // 3. Construct graph
  return soc_interface_ConstructGraph();

  // Keep the following line to use dummy graph construction:
  // return soc_interface_setupDummyGraph(3 /* inception version */);
}

bool HexagonControlWrapper::ExecuteGraph() {
  return soc_interface_ExecuteGraph();
}

bool HexagonControlWrapper::TeardownGraph() {
  soc_interface_ReleaseNodeInputAndNodeOutputArray();
  return soc_interface_TeardownGraph();
}

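// Fills one input node's buffer. The caller's bytes (or zeros, when
// DBG_USE_DUMMY_INPUT is set) are copied into a 16-byte-aligned staging
// buffer kept alive in input_tensor_data_, and the aligned pointer is handed
// to the DSP. The expected byte size is recomputed as
// x * y * z * d * DataTypeSize(dtype) and CHECKed against the size recorded
// in `bytes`.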
bool HexagonControlWrapper::FillInputNode(
    const string& node_name,
    const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>& shape,
    const ConstByteArray bytes) {
  const string tensor_name = AddPort(node_name);
  CHECK(input_port_map_.count(tensor_name) > 0);
  const int port = input_port_map_.at(tensor_name);
  if (input_tensor_data_.count(port) <= 0) {
    input_tensor_data_.emplace(port, std::vector<uint8>{});
  }
  std::vector<uint8>& input_tensor_data = input_tensor_data_.at(port);

  // Hexagon only supports 32-bit dimensions.
  const int x = static_cast<int>(shape[0]);
  const int y = static_cast<int>(shape[1]);
  const int z = static_cast<int>(shape[2]);
  const int d = static_cast<int>(shape[3]);

  const uint64 byte_size = x * y * z * d * DataTypeSize(std::get<2>(bytes));
  CHECK_EQ(byte_size, std::get<1>(bytes));
  input_tensor_data.resize(byte_size + ALIGNMENT_BYTES);
  uint8* data_ptr = FindAlignedPointer(input_tensor_data.data());

  if (DBG_USE_DUMMY_INPUT) {
    std::memset(data_ptr, 0, byte_size);
  } else {
    std::memcpy(data_ptr, std::get<0>(bytes), byte_size);
  }

  return soc_interface_FillInputNodeWithPort(port, x, y, z, d, data_ptr,
                                             byte_size);
}

bool HexagonControlWrapper::ReadOutputNode(
    const string& node_name, TensorAllocatorFunc tensor_allocator) {
  CHECK_NE(execute_info_, nullptr);
  TensorShape output_shape;
  // TODO(satok): Switch shape corresponding to input shape
  for (int i = 0; i < execute_info_->graph_output_node_name_size(); ++i) {
    if (execute_info_->graph_output_node_name(i) == node_name) {
      for (const TensorShapeProto::Dim& dim :
           execute_info_->default_graph_output_tensor_shape(i).shape().dim()) {
        output_shape.AddDim(dim.size());
      }
      break;
    }
  }
  std::vector<ByteArray> outputs;
  ReadOutputNode(node_name, &outputs);
  CHECK_EQ(1, outputs.size());
  ByteArray& output = outputs[0];
  Tensor* output_tensor = tensor_allocator(output_shape);
  CHECK(output_tensor->TotalBytes() >= std::get<1>(output))
      << output_tensor->TotalBytes() << ", " << std::get<1>(output);
  TF_CHECK_OK(RemoteFusedGraphExecuteUtils::CopyByteArrayToTensor(
      std::get<0>(output), std::get<1>(output), output_tensor));
  return true;
}

bool HexagonControlWrapper::ReadOutputNode(
    const string& node_name, std::vector<ByteArray>* const outputs) {
  CHECK(outputs != nullptr);
  ByteArray output;
  const string tensor_name = AddPort(node_name);
  CHECK(output_port_map_.count(tensor_name) > 0);
  const int port = output_port_map_.at(tensor_name);
  soc_interface_ReadOutputNodeWithPort(
      port, &std::get<0>(output),
      reinterpret_cast<uint64_t*>(&std::get<1>(output)));
  // TODO: Accept all results
  // std::get<2>(output) = DT_FLOAT;
  outputs->emplace_back(output);
  return true;
}

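// Rewrites `original_graph_def` so that the subgraph whose ops are supported
// by HexagonOpsDefinitions is collapsed into a single remote-fused-graph node
// (named with REMOTE_FUSED_GRAPH_NODE_NAME_PREFIX) that this executor runs;
// nodes outside the fused set presumably remain ordinary CPU ops.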
Status HexagonControlWrapper::FuseRemoteGraph(
    const GraphDef& original_graph_def, const std::vector<string>& inputs,
    const std::vector<string>& outputs, GraphDef* fused_graph_def) {
  const std::unordered_set<string> fused_node_names =
      RemoteFusedGraphExecuteUtils::BuildNodeMapFromOpsDefinitions(
          original_graph_def, HexagonOpsDefinitions::getInstance());
  // TODO(satok): We may want to attach shape and type information inside this
  // function if it is not already attached to the given graph.
  TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::FuseRemoteGraphByNodeNames(
      original_graph_def, inputs, outputs, REMOTE_FUSED_GRAPH_NODE_NAME_PREFIX,
      fused_node_names, REMOTE_FUSED_GRAPH_EXECUTOR_NAME,
      /*require_shape_type=*/true, fused_graph_def));
  return Status::OK();
}

bool HexagonControlWrapper::FillInputNode(const string& node_name,
                                          const Tensor& tensor) {
  StringPiece tensor_data = tensor.tensor_data();
  const ConstByteArray ba =
      ConstByteArray(reinterpret_cast<const uint8*>(tensor_data.data()),
                     tensor_data.size(), tensor.dtype());
  if (DBG_DUMP_INPUT_TENSOR_AS_FLOAT_DATA) {
    LOG(INFO) << "Input tensor data: element size = " << tensor.NumElements()
              << ", byte size = " << tensor.TotalBytes();
    std::stringstream line;
    for (int i = 0; i < tensor.NumElements(); ++i) {
      line << tensor.flat<float>().data()[i] << ", ";
      if ((i - 2) % 3 == 0 || i == tensor.NumElements() - 1) {
        LOG(INFO) << "(" << ((i - 2) / 3) << ") " << line.str();
        line.str("");
        line.clear();
      }
    }
  }
  const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE> shape =
      GraphTransferer::ToTensorShapeArray(tensor.shape());
  FillInputNode(node_name, shape, ba);
  return true;
}

bool HexagonControlWrapper::IsEnabled() const { return true; }

}  // namespace tensorflow