/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionPlan"

#include "ExecutionPlan.h"

#include "Callbacks.h"
#include "CompilationBuilder.h"
#include "ExecutionBuilder.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "Utils.h"

#include <functional>
#include <map>
#include <queue>
#include <unordered_set>
#include <utility>
#include <vector>

using ::android::hardware::neuralnetworks::V1_0::implementation::ExecutionCallback;
using ::android::hardware::neuralnetworks::V1_0::implementation::PreparedModelCallback;

namespace android {
namespace nn {

// Synchronously compiles the given model on the given driver device:
// launches IDevice::prepareModel(), waits on the callback, and returns the
// prepared model through *preparedModel.  Returns ANEURALNETWORKS_NO_ERROR on
// success, or ANEURALNETWORKS_OP_FAILED if the launch fails (transport error
// or bad launch status) or if the driver reports an error / a null prepared
// model.  Must not be called for the CPU (callers treat device == nullptr as
// CPU and skip compilation entirely).
static int compile(std::shared_ptr<Device> device, const ModelBuilder* model,
                   int32_t executionPreference, sp<IPreparedModel>* preparedModel) {
    nnAssert(device != nullptr);  // nullptr indicates CPU
    // Compilation logic copied from ExecutionBuilder::startComputeOnDevice().
    Model hidlModel;
    model->setHidlModel(&hidlModel);

    sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();
    Return<ErrorStatus> prepareLaunchStatus = device->getInterface()->prepareModel(
            hidlModel, static_cast<ExecutionPreference>(executionPreference),
            preparedModelCallback);
    if (!prepareLaunchStatus.isOk()) {
        // HIDL transport failure: the call never reached the driver.
        LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed due to transport error: "
                   << prepareLaunchStatus.description();
        return ANEURALNETWORKS_OP_FAILED;
    }
    if (prepareLaunchStatus != ErrorStatus::NONE) {
        // The driver rejected the prepareModel() launch itself.
        LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed with error: "
                   << toString(static_cast<ErrorStatus>(prepareLaunchStatus));
        return ANEURALNETWORKS_OP_FAILED;
    }

    // Block until the driver invokes the callback with the final status.
    preparedModelCallback->wait();
    ErrorStatus prepareReturnStatus = preparedModelCallback->getStatus();
    *preparedModel = preparedModelCallback->getPreparedModel();
    if (prepareReturnStatus != ErrorStatus::NONE || *preparedModel == nullptr) {
        LOG(ERROR) << "ExecutionPlan compilation on " << device->getName() << " failed:"
                   << " prepareReturnStatus=" << toString(prepareReturnStatus)
                   << ", preparedModel=" << preparedModel->get();
        return ANEURALNETWORKS_OP_FAILED;
    }
    return ANEURALNETWORKS_NO_ERROR;
}

// Callback invoked (with the operation's index) when all of an operation's
// non-constant inputs have become known, i.e. the operation is ready to run.
typedef std::function<void(uint32_t)> OperationReadyCallback;

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
public:
    // Creates the tracker for this model. Figure out which operations can be
    // executed right away and cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Mark the specified operation as having been processed. The output
    // of the operation now being known, this may make new operations to be
    // able to run. Call cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

private:
    const ModelBuilder* mModel;
    // Maps an operand index to the indexes of the operations that consume it.
    // Only operands produced at run time (TEMPORARY_VARIABLE / MODEL_OUTPUT
    // lifetimes) are tracked here; constants and model inputs are always known.
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    // Number of still-unknown inputs, indexed by operation. An operation is
    // ready once its count reaches zero.
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) :
        mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                lifetime == OperandLifeTime::MODEL_OUTPUT) {
                // This input is produced by some other operation, so it is
                // unknown until that producer has been processed.
                count++;
                mOperandToOperations.insert(
                        std::pair<uint32_t, uint32_t>(operandIndex, operationIndex));
            }
        }
        if (count == 0) {
            // All inputs are already known (constants/model inputs):
            // the operation is immediately ready.
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        // Decrement the unknown-input count of every consumer of this operand;
        // any consumer that reaches zero becomes ready.
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}

// An ExecutionStep is one partition of the original model, to be run on a
// single device (mDevice == nullptr denotes the CPU).
ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
                             std::shared_ptr<Device> device)
        : mPlan(plan), mIndex(stepIndex), mSubModel(), mDevice(device) {}

// Adds an operand if it has not been added already.
// Sets the index in the submodel for the corresponding operand.
137 int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex, 138 const ModelBuilder& fromModel, OperandKind kind) { 139 // Have we added this operand already? 140 auto i = mOperandMap.find(fromOperandIndex); 141 if (i != mOperandMap.end()) { 142 nnAssert(kind == INPUT); 143 *toOperandIndex = i->second; 144 return ANEURALNETWORKS_NO_ERROR; 145 } 146 147 // First time we add this operand. 148 *toOperandIndex = mSubModel.operandCount(); 149 mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex)); 150 151 // Add the operand to the submodel. 152 const Operand& operand = fromModel.getOperand(fromOperandIndex); 153 ANeuralNetworksOperandType type = { 154 .type = static_cast<int32_t>(operand.type), 155 .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()), 156 .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr, 157 .scale = operand.scale, 158 .zeroPoint = operand.zeroPoint 159 }; 160 int n = mSubModel.addOperand(type); 161 if (n != ANEURALNETWORKS_NO_ERROR) { 162 LOG(ERROR) << "Previous error occurred when partitioning the graph"; 163 return n; 164 } 165 166 // Sets its value. 
167 switch (operand.lifetime) { 168 case OperandLifeTime::CONSTANT_COPY: { 169 const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset); 170 n = mSubModel.setOperandValue(*toOperandIndex, data, operand.location.length); 171 if (n != ANEURALNETWORKS_NO_ERROR) { 172 LOG(ERROR) << "Previous error occurred when partitioning the graph"; 173 return n; 174 } 175 } break; 176 case OperandLifeTime::CONSTANT_REFERENCE: { 177 const Memory* memory = fromModel.getMemories()[operand.location.poolIndex]; 178 n = mSubModel.setOperandValueFromMemory(*toOperandIndex, memory, 179 operand.location.offset, 180 operand.location.length); 181 if (n != ANEURALNETWORKS_NO_ERROR) { 182 LOG(ERROR) << "Previous error occurred when partitioning the graph"; 183 return n; 184 } 185 } break; 186 case OperandLifeTime::NO_VALUE: { 187 n = mSubModel.setOperandValue(*toOperandIndex, nullptr, 0); 188 if (n != ANEURALNETWORKS_NO_ERROR) { 189 LOG(ERROR) << "Previous error occurred when partitioning the graph"; 190 return n; 191 } 192 } break; 193 case OperandLifeTime::TEMPORARY_VARIABLE: // handled similarly to MODEL_OUTPUT 194 if (kind == INPUT) { 195 // The first time we've seen this operand is as an 196 // input. That means it must be defined by a 197 // different partition, and is an input to this one. 198 mTempsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex)); 199 } else { 200 // The first time we've seen this operand is as an 201 // output. It may be an input to a different 202 // partition, so keep track of it. 203 mPlan->recordTemporaryDef(fromOperandIndex, mIndex); 204 } 205 break; 206 case OperandLifeTime::MODEL_INPUT: 207 mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex)); 208 break; 209 case OperandLifeTime::MODEL_OUTPUT: // handled similarly to TEMPORARY_VARIABLE 210 if (kind == INPUT) { 211 // The first time we've seen this operand is as an 212 // input. 
That means it must be defined by a 213 // different partition, and is an input to this one. 214 mOutputsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex)); 215 } else { 216 // The first time we've seen this operand is as an 217 // output. 218 mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex)); 219 } 220 break; 221 default: 222 nnAssert(false); 223 break; 224 } 225 226 return ANEURALNETWORKS_NO_ERROR; 227 } 228 229 int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) { 230 const Operation& operation = fromModel.getOperation(operationIndex); 231 232 // Convert the input and output operand indexes. 233 // 234 // We expect operations to be added in topological order. Therefore: 235 // 236 // - We may not have seen an input if it is a model input, a 237 // constant, or an operand written by a different partition. 238 // 239 // - We should not have seen any outputs. 240 const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size()); 241 const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size()); 242 std::vector<uint32_t> inputs(inputCount); 243 std::vector<uint32_t> outputs(outputCount); 244 245 auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands, 246 std::vector<uint32_t>& localOperands, 247 OperandKind kind) -> int { 248 const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size()); 249 for (uint32_t i = 0; i < operandCount; i++) { 250 uint32_t localOperand = ~0U; 251 int n = addOperand(globalOperands[i], &localOperand, fromModel, kind); 252 if (n != ANEURALNETWORKS_NO_ERROR) 253 return n; 254 localOperands[i] = localOperand; 255 } 256 return ANEURALNETWORKS_NO_ERROR; 257 }; 258 259 int n; 260 if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR || 261 (n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) { 262 return n; 263 } 264 265 return 
mSubModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(), 266 outputCount, outputs.data()); 267 } 268 269 void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const { 270 for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) { 271 stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i); 272 } 273 for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) { 274 stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i); 275 } 276 } 277 278 void ExecutionPlan::CompoundBody::findTempsAsSubModelOutputs() { 279 for (const auto& step : mSteps) { 280 for (const auto& input : step->getTempsAsSubModelInputs()) { 281 const uint32_t fromModelIndex = input.first; 282 const auto it = mTemporaryToDefiningStep.find(fromModelIndex); 283 nnAssert(it != mTemporaryToDefiningStep.end()); 284 const uint32_t stepIndex = it->second; 285 nnAssert(stepIndex < mSteps.size()); 286 mSteps[stepIndex]->recordTempAsSubModelOutput(fromModelIndex); 287 } 288 } 289 } 290 291 void ExecutionStep::logSubModel() const { 292 VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex; 293 294 auto logRemapEntry = [](std::string &toLog, const std::pair<uint32_t, uint32_t>& e) { 295 if (!toLog.empty()) { 296 toLog += ", "; 297 } 298 toLog += "("; 299 toLog += std::to_string(e.first); 300 toLog += "->"; 301 toLog += std::to_string(e.second); 302 toLog += ")"; 303 }; 304 305 auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) { 306 std::string toLog; 307 for (const auto& e : map) { 308 logRemapEntry(toLog, e); 309 } 310 VLOG(COMPILATION) << name << ": " << toLog; 311 }; 312 auto logRemapSet = [&logRemapEntry](const char* name, const SubModelOutputSetType& set) { 313 std::string toLog; 314 for (const auto& e : set) { 315 logRemapEntry(toLog, e); 316 } 317 VLOG(COMPILATION) << name << ": " << toLog; 318 }; 319 320 logRemapVector("model inputs", 
mModelInputs); 321 logRemapVector("model outputs", mModelOutputs); 322 logRemapVector("temps as submodel inputs", mTempsAsSubModelInputs); 323 logRemapSet("temps as submodel outputs", mTempsAsSubModelOutputs); 324 logRemapVector("outputs as submodel inputs", mOutputsAsSubModelInputs); 325 } 326 327 static void convertModelInputsOrOutputs( 328 // IN: mModel{Inputs|Outputs} 329 const ExecutionStep::RemapVectorType& myModelInputsOrOutputs, 330 // IN: fromModel->{input|output}Count() 331 uint32_t fromModelInputOrOutputCount, 332 // IN: fromModel->get{Input|Output}OperandIndex 333 std::function<uint32_t(uint32_t)> fromModelGetInputOrOutputOperandIndex, 334 // OUT: for v : mModel{Inputs|Outputs} : v.second 335 std::vector<uint32_t>* inputsOrOutputs, 336 // OUT: submodel input-or-output index to original model input-or-output index 337 std::vector<uint32_t>* inputOrOutputIndexSubModelToFromModel) { 338 std::map<uint32_t, uint32_t> fromModelIndexMap; // operand index to input-or-output index 339 for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) { 340 fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i; 341 } 342 for (const auto& myInputOrOutput : myModelInputsOrOutputs) { 343 inputsOrOutputs->push_back(myInputOrOutput.second); 344 const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first]; 345 inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex); 346 } 347 } 348 349 int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize, 350 int32_t executionPreference) { 351 if (VLOG_IS_ON(COMPILATION)) { 352 logSubModel(); 353 } 354 355 mSubModel.relaxComputationFloat32toFloat16(fromModel->isComputationFloat32RelaxedToFloat16()); 356 357 // Input order: mModelInputs, mTempsAsSubModelInputs, mOutputsAsSubModelInputs 358 // Output order: mModelOutputs, mTempsAsSubModelOutputs 359 // 360 // ExecutionPlan::next() depends on these orderings. 
361 362 std::vector<uint32_t> inputs; 363 convertModelInputsOrOutputs(mModelInputs, 364 fromModel->inputCount(), 365 [=](uint32_t i) { return fromModel->getInputOperandIndex(i); }, 366 &inputs, 367 &mInputIndexSubModelToFromModel); 368 for (const auto& subModelInput : mTempsAsSubModelInputs) { 369 inputs.push_back(subModelInput.second); 370 } 371 for (const auto& subModelInput : mOutputsAsSubModelInputs) { 372 inputs.push_back(subModelInput.second); 373 } 374 375 std::vector<uint32_t> outputs; 376 convertModelInputsOrOutputs(mModelOutputs, 377 fromModel->outputCount(), 378 [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); }, 379 &outputs, 380 &mOutputIndexSubModelToFromModel); 381 for (const auto& subModelOutput : mTempsAsSubModelOutputs) { 382 outputs.push_back(subModelOutput.second); 383 const Operand& operand = mSubModel.getOperand(subModelOutput.second); 384 for (uint32_t dimension : operand.dimensions) { 385 if (dimension == 0) { 386 *hasOutputOfUnknownSize = true; 387 VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first 388 << " of original graph) has unknown size: " 389 << toString(operand); 390 break; 391 } 392 } 393 } 394 395 { 396 int n = mSubModel.identifyInputsAndOutputs(inputs.size(), &inputs[0], outputs.size(), &outputs[0]); 397 if (n != ANEURALNETWORKS_NO_ERROR) { 398 return n; 399 } 400 n = mSubModel.finish(); 401 if (n != ANEURALNETWORKS_NO_ERROR) { 402 return n; 403 } 404 } 405 406 { 407 // Compute mOutputsAsSubModelInputsIndexToFromModel. 
408 409 std::map<uint32_t, uint32_t> fromModelOperandIndexToOutputIndex; 410 for (unsigned i = 0, e = fromModel->outputCount(); i < e; ++i) { 411 fromModelOperandIndexToOutputIndex[fromModel->getOutputOperandIndex(i)] = i; 412 } 413 414 for (unsigned i = 0, e = mOutputsAsSubModelInputs.size(); i < e; i++) { 415 const uint32_t fromModelOperandIndex = mOutputsAsSubModelInputs[i].first; 416 const auto it = fromModelOperandIndexToOutputIndex.find(fromModelOperandIndex); 417 if (it == fromModelOperandIndexToOutputIndex.end()) { 418 LOG(ERROR) << "Could not find main model output operand " << fromModelOperandIndex 419 << " in main model output operand list"; 420 return ANEURALNETWORKS_BAD_STATE; 421 } 422 mOutputsAsSubModelInputsIndexToFromModel.push_back(it->second); 423 } 424 } 425 426 // TODO: Move compilation elsewhere? 427 428 if (mDevice == nullptr) { 429 return ANEURALNETWORKS_NO_ERROR; 430 } 431 432 VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation"; 433 return compile(mDevice, &mSubModel, executionPreference, &mPreparedSubModel); 434 } 435 436 void ExecutionStep::dump() const { 437 Model model; 438 mSubModel.setHidlModel(&model); 439 if (VLOG_IS_ON(COMPILATION)) { 440 VLOG(COMPILATION) << "ExecutionStep#" << mIndex 441 << " for " << (mDevice == nullptr ? 
"CPU" : mDevice->getName()); 442 logModelToInfo(model); 443 } 444 } 445 446 int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel, 447 int32_t executionPreference) { 448 findTempsAsSubModelOutputs(); 449 for (const auto& step : mSteps) { 450 int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize, 451 executionPreference); 452 if (n != ANEURALNETWORKS_NO_ERROR) { 453 VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed"; 454 return n; 455 } 456 } 457 if (mHasSubModelOutputOfUnknownSize) { 458 VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize"; 459 return ANEURALNETWORKS_OP_FAILED; 460 } 461 462 mSuccessfulFinish = true; 463 return ANEURALNETWORKS_NO_ERROR; 464 } 465 466 int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel, 467 int32_t executionPreference) { 468 if (mDevice == nullptr) { 469 mSuccessfulFinish = true; 470 return ANEURALNETWORKS_NO_ERROR; 471 } 472 473 VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation"; 474 const int n = compile(mDevice, mModel, executionPreference, &mPreparedModel); 475 mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR); 476 return n; 477 } 478 479 int ExecutionPlan::finish(const ModelBuilder* fromModel, int32_t executionPreference) { 480 nnAssert(mBody != nullptr); 481 return mBody->finish(fromModel, executionPreference); 482 } 483 484 ExecutionPlan::Controller::Controller( 485 const ExecutionPlan* plan, 486 const ExecutionBuilder* executionBuilder, 487 std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs, 488 uint32_t totalSizeOfTemporaries) : 489 mPlan(plan), mExecutionBuilder(executionBuilder), 490 mSubModelInputsAndOutputs(subModelInputsAndOutputs), mNextStepIndex(0) { 491 if (totalSizeOfTemporaries) { 492 if (mTemporaries.create(totalSizeOfTemporaries) != ANEURALNETWORKS_NO_ERROR) { 493 LOG(ERROR) << "ExecutionPlan::Controller failed to allocate 
temporaries"; 494 mNextStepIndex = kBadStepIndex; 495 } 496 } 497 } 498 499 std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController( 500 const ExecutionBuilder* executionBuilder) const { 501 nnAssert((mState == EMPTY) == (mBody == nullptr)); 502 if (mBody && !mBody->mSuccessfulFinish) { 503 VLOG(EXECUTION) << "ExecutionPlan::makeController -- unsuccessful finish"; 504 return std::shared_ptr<Controller>(nullptr); 505 } 506 507 // Create the layout for a Memory object big enough for to hold 508 // every TEMPORARY in the original model that is live across 509 // partition boundaries. 510 // 511 // TODO: Rethink this approach for managing temporaries. Some 512 // alternatives: 513 // 514 // 1) Adopt a memory layout scheme analogous to stack allocation, 515 // where objects of non-overlapping lifetime can occupy the same 516 // storage. We would still have a single Memory object in this 517 // case. 518 // 519 // 2) Do something like what CpuExecutor does, and do allocations 520 // and deallocations on the fly (during execution) before first 521 // reference and after last reference, respectively. This would 522 // mean having one Memory object per TEMPORARY; or, in a more 523 // complicated implementation, one Memory object per set of 524 // temporaries that have the same lifetime. Note that the Android 525 // system limits the number of shared memory objects, which are 526 // what our Memory objects represent. 
527 // 528 uint32_t totalSizeOfTemporaries = 0; 529 std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs; 530 if (mState == COMPOUND) { 531 const ModelBuilder* fromModel = executionBuilder->getModel(); 532 for (const auto& step : compound()->mSteps) { 533 for (const auto& output: step->getTempsAsSubModelOutputs()) { 534 const uint32_t fromModelOperandIndex = output.first; 535 const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex); 536 if (subModelInputsAndOutputs == nullptr) { 537 subModelInputsAndOutputs = 538 std::make_shared<Controller::SubModelInputsAndOutputsType>(); 539 } 540 const uint32_t size = sizeOfData(fromModelOperand); 541 totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size); 542 subModelInputsAndOutputs->insert(std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries)); 543 totalSizeOfTemporaries += size; 544 } 545 } 546 if (VLOG_IS_ON(EXECUTION) && (subModelInputsAndOutputs != nullptr)) { 547 for (const auto& io : *subModelInputsAndOutputs) { 548 VLOG(EXECUTION) << "temp: origOpndIdx = " << io.first 549 << ", offset = " << io.second; 550 } 551 } 552 } 553 554 return std::shared_ptr<Controller>(new Controller(this, executionBuilder, 555 subModelInputsAndOutputs, 556 totalSizeOfTemporaries)); 557 } 558 559 560 // TODO: Find a better way to provide this functionality. 561 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller, 562 std::shared_ptr<StepExecutor>* executor) const { 563 *executor = nullptr; 564 565 VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor 566 << "): mNextStepIndex = " << controller->mNextStepIndex; 567 568 if (controller->mNextStepIndex == 0) { 569 // We haven't called next(). 570 return ANEURALNETWORKS_OP_FAILED; 571 } 572 573 if (controller->mNextStepIndex == Controller::kBadStepIndex) { 574 // The last call to next() did not produce an executor. 
575 return ANEURALNETWORKS_OP_FAILED; 576 } 577 578 --controller->mNextStepIndex; 579 return next(controller, executor); 580 } 581 582 int ExecutionPlan::next(std::shared_ptr<Controller> controller, 583 std::shared_ptr<StepExecutor>* executor) const { 584 *executor = nullptr; 585 586 VLOG(EXECUTION) << "ExecutionPlan::next(" 587 << SHOW_IF_DEBUG(controller << ", " << executor) 588 << "): mNextStepIndex = " << controller->mNextStepIndex; 589 590 if (controller->mNextStepIndex == Controller::kBadStepIndex) { 591 return ANEURALNETWORKS_OP_FAILED; 592 } 593 594 if (mState == EMPTY) { 595 nnAssert(controller->mNextStepIndex == 0); // end 596 controller->mNextStepIndex = Controller::kBadStepIndex; 597 return ANEURALNETWORKS_NO_ERROR; 598 } 599 600 if (mState == SIMPLE) { 601 if (controller->mNextStepIndex == 0) { 602 // First (and only) step. 603 auto simpleBody = static_cast<const SimpleBody*>(mBody); 604 *executor = std::make_shared<StepExecutor>( 605 controller->mExecutionBuilder, 606 simpleBody->mModel, 607 (simpleBody->mDevice == nullptr ? nullptr : simpleBody->mDevice->getInterface()), 608 simpleBody->mPreparedModel); 609 (*executor)->mapInputsAndOutputsTrivially(); 610 controller->mNextStepIndex = 1; 611 return ANEURALNETWORKS_NO_ERROR; 612 } 613 614 nnAssert(controller->mNextStepIndex == 1); // end 615 controller->mNextStepIndex = Controller::kBadStepIndex; 616 return ANEURALNETWORKS_NO_ERROR; 617 } 618 619 auto compoundBody = compound(); 620 621 if (controller->mNextStepIndex == compoundBody->mSteps.size()) { 622 // end 623 controller->mNextStepIndex = Controller::kBadStepIndex; 624 return ANEURALNETWORKS_NO_ERROR; 625 } 626 627 // Input order: model inputs, temps as submodel inputs, outputs as submodel inputs 628 // Output order: model outputs, temps as submodel outputs 629 // 630 // ExecutionStep::finishSubModel() establishes these orderings. 
631 632 const auto step = compoundBody->mSteps[controller->mNextStepIndex]; 633 *executor = std::make_shared<StepExecutor>( 634 controller->mExecutionBuilder, 635 step->getSubModel(), 636 (step->getDevice() == nullptr ? nullptr : step->getDevice()->getInterface()), 637 step->getPreparedSubModel()); 638 step->mapInputsAndOutputs(*executor); 639 if (controller->mSubModelInputsAndOutputs != nullptr) { 640 { 641 // Tell executor about temps as submodel outputs. 642 643 const size_t firstSubModelOutputIndex = step->getModelOutputs().size(); 644 const auto& subModelOutputs = step->getTempsAsSubModelOutputs(); 645 646 uint32_t idx = 0; 647 for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) { 648 const uint32_t fromModelOperandIndex = I->first; 649 const uint32_t offsetOfTemporary = 650 controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex); 651 int n = (*executor)->setOutputFromTemporaryMemory( 652 firstSubModelOutputIndex + idx, 653 &controller->mTemporaries, 654 offsetOfTemporary); 655 if (n != ANEURALNETWORKS_NO_ERROR) { 656 controller->mNextStepIndex = Controller::kBadStepIndex; 657 return n; 658 } 659 } 660 } 661 { 662 // Tell executor about temps as submodel inputs. 663 664 const size_t firstSubModelInputIndex = step->getModelInputs().size(); 665 const auto& subModelInputs = step->getTempsAsSubModelInputs(); 666 667 uint32_t idx = 0; 668 for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) { 669 const uint32_t fromModelOperandIndex = I->first; 670 const uint32_t offsetOfTemporary = 671 controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex); 672 int n = (*executor)->setInputFromTemporaryMemory( 673 firstSubModelInputIndex + idx, 674 &controller->mTemporaries, 675 offsetOfTemporary); 676 if (n != ANEURALNETWORKS_NO_ERROR) { 677 controller->mNextStepIndex = Controller::kBadStepIndex; 678 return n; 679 } 680 } 681 } 682 } 683 { 684 // Tell executor about outputs as submodel inputs. 
685 686 const size_t firstOutputsAsSubModelInputIndex = 687 step->getModelInputs().size() + step->getTempsAsSubModelInputs().size(); 688 const auto& outputsAsSubModelInputsIndexToFromModel = 689 step->getOutputsAsSubModelInputsIndexToFromModel(); 690 for (uint32_t i = 0, e = outputsAsSubModelInputsIndexToFromModel.size(); i < e; i++) { 691 uint32_t o = outputsAsSubModelInputsIndexToFromModel[i]; 692 (*executor)->mapOutputToInput(o, firstOutputsAsSubModelInputIndex + i); 693 } 694 } 695 696 controller->mNextStepIndex++; 697 return ANEURALNETWORKS_NO_ERROR; 698 } 699 700 std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) { 701 nnAssert(mState != SIMPLE); 702 if (mState == EMPTY) { 703 mBody = new CompoundBody(); 704 mState = COMPOUND; 705 } 706 auto& steps = compound()->mSteps; 707 auto step = std::make_shared<ExecutionStep>(this, steps.size(), device); 708 steps.push_back(step); 709 return step; 710 } 711 712 void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device, 713 const ModelBuilder* model) { 714 nnAssert(mState == EMPTY); 715 mBody = new SimpleBody(device, model); 716 mState = SIMPLE; 717 } 718 719 void ExecutionPlan::dump() const { 720 if (mBody) { 721 mBody->dump(); 722 } else { 723 VLOG(COMPILATION) << "EMPTY"; 724 } 725 } 726 727 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const { 728 switch (mState) { 729 case EMPTY: 730 return Kind::EMPTY; 731 case SIMPLE: 732 nnAssert(mBody); 733 return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR; 734 case COMPOUND: 735 nnAssert(mBody); 736 return mBody->mSuccessfulFinish ? 
Kind::COMPOUND : Kind::ERROR; 737 default: 738 nnAssert(!"unexpected state"); 739 return Kind::ERROR; 740 } 741 } 742 743 std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const { 744 nnAssert(mState == SIMPLE); 745 return static_cast<const SimpleBody*>(mBody)->mDevice; 746 } 747 748 const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const { 749 return compound()->mSteps; 750 } 751 752 bool ExecutionPlan::forTest_hasSubModelOutputsOfUnknownSize() const { 753 return mBody->hasSubModelOutputsOfUnknownSize(); 754 } 755 756 void ExecutionPlan::SimpleBody::dump() const { 757 VLOG(COMPILATION) << "SIMPLE for " << (mDevice == nullptr ? "CPU" : mDevice->getName()); 758 } 759 760 void ExecutionPlan::CompoundBody::dump() const { 761 for (const auto& step : mSteps) { 762 step->dump(); 763 } 764 } 765 766 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices, 767 uint32_t preference, ExecutionPlan* plan) const { 768 // This function uses a heuristic approach to partitioning the graph. 769 // It should be good enough for the first release. 770 771 const size_t nonCpuDeviceCount = devices.size(); 772 // The device count is the number of HAL devices + 1. The +1 is for the CPU. 773 // Note that deviceCount includes CPU, which has no entry in devices[]. 774 const size_t deviceCount = nonCpuDeviceCount + 1; 775 const size_t operationCount = mOperations.size(); 776 777 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount 778 << ", operationCount = " << operationCount; 779 780 // If we only have the CPU, or if the graph has no operations, no need to try to partition. 781 if (nonCpuDeviceCount == 0 || operationCount == 0) { 782 // Make sure no op is an OEM operation. 
        // CPU-only fallback path (reached when no non-CPU devices are available;
        // the enclosing "if" opens before this excerpt): the CPU cannot execute
        // OEM operations, so finding one here is a hard error.
        for (auto& op: mOperations) {
            if (op.type == OperationType::OEM_OPERATION) {
                LOG(ERROR) << "No driver can do the OEM op";
                return ANEURALNETWORKS_BAD_DATA;
            }
        }
        // nullptr is the sentinel for the CPU throughout the partitioner.
        plan->becomeSingleStep(nullptr /* CPU */, this);
        return plan->finish(this, preference);
    }

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector, with devices.size()
    // representing the CPU.
    std::vector<int> bestDeviceForOperation(operationCount);
    int status = findBestDeviceForEachOperation(preference, devices, deviceCount,
                                                &bestDeviceForOperation);
    if (status != ANEURALNETWORKS_NO_ERROR) {
        return status;
    }

    // If one device will run all the operations, we don't need to split the work.
    // adjacent_find with not_equal_to returns end() iff all entries are equal.
    if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        // Index deviceCount - 1 is the CPU sentinel (the CPU has no entry in devices[]).
        const bool cpu = (size_t(bestDeviceIndex) == deviceCount - 1);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                          << bestDeviceIndex << " = "
                          << (cpu ? "CPU" : devices[bestDeviceIndex]->getName());
        plan->becomeSingleStep(cpu ? nullptr : devices[bestDeviceIndex], this);
        return plan->finish(this, preference);
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex;
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the CPU. We do this to try to maximize the
    // size of the graph we'll send to non-CPU devices. If the CPU runs first,
    // it will have the chance to prepare more of the inputs required by the
    // other devices. This function returns -1 if all queues are empty.
    auto findNextDeviceToProcess = [&]() -> int {
        // Iterate from the highest index (the CPU sentinel) down to 0.
        for (int i = deviceCount - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    // The tracker invokes the callback for every operation whose inputs are
    // already known, seeding the per-device queues.
    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create an execution step.
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
        if (deviceIndex < 0) {
            // All queues drained: every operation has been assigned to a step.
            break;
        }
        // nullptr represents the CPU.
        std::shared_ptr<Device> device =
                static_cast<size_t>(deviceIndex) < nonCpuDeviceCount
                    ? devices[deviceIndex] : nullptr;

        // Assign as much as possible to this device.
        std::shared_ptr<ExecutionStep> step = plan->createNewStep(device);
        auto& queue = perDeviceQueue[deviceIndex];
        while (!queue.empty()) {
            uint32_t operationIndex = queue.front();
            queue.pop();
            int n = step->addOperation(operationIndex, *this);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                return n;
            }
            // Marking this operation processed may make more operations ready
            // (possibly for this same device, extending the current step).
            tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
        }
    }

    int n = plan->finish(this, preference);
    if (VLOG_IS_ON(COMPILATION)) {
        // Debug aid: dump the original model and the resulting plan side by side.
        Model model;
        setHidlModel(&model);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
        logModelToInfo(model);
        plan->dump();
    }
    return n;
}

// Returns the performance the given device reports for the data type this
// operation works on; used by findBestDeviceForEachOperation to rank devices.
PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
                                                 uint32_t operationIndex) const {
    const Operation& operation = getOperation(operationIndex);
    // TODO This assumes that the type is dictated by the first operand. This is
    // currently the case but is not a safe assumption to make in the long term.
    const uint32_t operandIndex = operation.inputs[0];
    const OperandType operandType = mOperands[operandIndex].type;
    switch(operandType) {
        case OperandType::FLOAT32:
        case OperandType::TENSOR_FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                // Model was built allowing fp16 computation for fp32 operands.
                return device->getRelaxedFloat32toFloat16Performance();
            } else {
                return device->getFloat32Performance();
            }
        case OperandType::INT32:
        case OperandType::UINT32:
        case OperandType::TENSOR_INT32:
        case OperandType::TENSOR_QUANT8_ASYMM:
            // For OEM, the real selection will be made from who can run the operand.
        case OperandType::OEM:
        case OperandType::TENSOR_OEM_BYTE:
            // Quantized performance is used as a stand-in for all non-float types.
            return device->getQuantized8Performance();
        default:
            // Unknown operand type is a programming error; the return after the
            // assert keeps non-debug builds well-formed.
            nnAssert(false);
            return device->getQuantized8Performance();
    }
}

namespace {
// This class determines whether a given device can execute a given operation
class CanDo {
public:
    CanDo() {}

    // Queries the device once for the whole model; afterwards check() answers
    // per-operation support from the cached vector.
    // NOTE(review): the status returned by getSupportedOperations is ignored;
    // if the query failed, mSupportsOperationByIndex could be empty and
    // check() would index out of bounds — confirm callers can rely on success.
    void initialize(const ModelBuilder* model, std::shared_ptr<Device> device) {
        Model hidlModel;
        model->setHidlModel(&hidlModel);
        device->getSupportedOperations(hidlModel, &mSupportsOperationByIndex);
    }

    bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

private:
    // One entry per operation of the model, in operation-index order.
    hidl_vec<bool> mSupportsOperationByIndex;
};
};  // anonymous namespace  (NOTE(review): the ';' after '}' is a stray empty declaration)

// For each operation, writes into (*bestDeviceForOperation)[i] the index of the
// chosen device, where index deviceCount - 1 (== nonCpuDeviceCount) denotes the
// CPU. Returns ANEURALNETWORKS_BAD_DATA if an OEM op has no capable driver.
int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference,
        const std::vector<std::shared_ptr<Device>>& devices,
        const size_t deviceCount,
        std::vector<int>* bestDeviceForOperation) const {

    // Note that deviceCount includes CPU, which has no entry in devices[]
    const size_t nonCpuDeviceCount = deviceCount - 1;

    // Ask every driver up front which operations it supports.
    std::vector<CanDo> canDo(nonCpuDeviceCount);
    for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(this, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        // Find which non-CPU device gives the best performance for this operation.
        int bestChoice = -1;
        float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
        for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
            const auto& device = devices[deviceIndex];
            if (canDo[deviceIndex].check(operationIndex)) {
                const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
                // Rank by power or by execution time, per the caller's preference.
                const float perfVal =
                        (preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
                                                                        : perf.execTime);
                // Strict '<' keeps the lowest-index device on ties.
                if (bestChoice < 0 || perfVal < bestPerfVal) {
                    bestChoice = deviceIndex;
                    bestPerfVal = perfVal;
                }
            } else {
                // Somewhat noisy logging, but only place where the user of
                // NNAPI can get feedback on why an operation was not run on a
                // specific device.
                // Logs O(operationCount * nonCpuDeviceCount) times, but
                // typically nonCpuDeviceCount is very small.
                VLOG(COMPILATION) << "Device " << device->getName()
                                  << " can't do operation "
                                  << toString(getOperation(operationIndex).type);
            }
        }
        // If it's the OEM op, we'd better have a device able to do it.
        if (mOperations[operationIndex].type == OperationType::OEM_OPERATION) {
            if (bestChoice < 0) {
                LOG(ERROR) << "No driver can do the OEM op";
                return ANEURALNETWORKS_BAD_DATA;
            }
        } else {
            // If no driver has been found, or if the best driver is not better than the CPU,
            // prefer the CPU. Since the performance is a ratio compared to the CPU performance,
            // by definition the performance of the CPU is 1.0.
            if (bestChoice < 0 || bestPerfVal >= 1.0) {
                bestChoice = nonCpuDeviceCount;  // The ID of the CPU.
            }
        }

        (*bestDeviceForOperation)[operationIndex] = bestChoice;
        VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                          << toString(getOperation(operationIndex).type)
                          << ") = "
                          << (*bestDeviceForOperation)[operationIndex];
    }
    return ANEURALNETWORKS_NO_ERROR;
}

}  // namespace nn
}  // namespace android