1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #define LOG_TAG "ExecutionPlan" 18 19 #include "ExecutionPlan.h" 20 21 #include "Callbacks.h" 22 #include "CompilationBuilder.h" 23 #include "ExecutionBuilder.h" 24 #include "Manager.h" 25 #include "ModelBuilder.h" 26 #include "Utils.h" 27 28 #include <functional> 29 #include <map> 30 #include <queue> 31 #include <unordered_set> 32 #include <utility> 33 #include <vector> 34 35 using ::android::hardware::neuralnetworks::V1_0::implementation::ExecutionCallback; 36 using ::android::hardware::neuralnetworks::V1_0::implementation::PreparedModelCallback; 37 38 namespace android { 39 namespace nn { 40 41 static int compile(std::shared_ptr<Device> device, 42 const ModelBuilder* model, 43 sp<IPreparedModel>* preparedModel) { 44 nnAssert(device != nullptr); // nullptr indicates CPU 45 // Compilation logic copied from ExecutionBuilder::startComputeOnDevice(). 
46 Model hidlModel; 47 model->setHidlModel(&hidlModel); 48 49 sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback(); 50 Return<ErrorStatus> prepareLaunchStatus = 51 device->getInterface()->prepareModel(hidlModel, preparedModelCallback); 52 if (!prepareLaunchStatus.isOk()) { 53 LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed due to transport error: " 54 << prepareLaunchStatus.description(); 55 return ANEURALNETWORKS_OP_FAILED; 56 } 57 if (prepareLaunchStatus != ErrorStatus::NONE) { 58 LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed with error: " 59 << toString(static_cast<ErrorStatus>(prepareLaunchStatus)); 60 return ANEURALNETWORKS_OP_FAILED; 61 } 62 63 preparedModelCallback->wait(); 64 ErrorStatus prepareReturnStatus = preparedModelCallback->getStatus(); 65 *preparedModel = preparedModelCallback->getPreparedModel(); 66 if (prepareReturnStatus != ErrorStatus::NONE || preparedModel == nullptr) { 67 LOG(ERROR) << "ExecutionPlan compilation on " << device->getName() << " failed:" 68 << " prepareReturnStatus=" << toString(prepareReturnStatus) 69 << ", preparedModel=" << preparedModel->get(); 70 return ANEURALNETWORKS_OP_FAILED; 71 } 72 return ANEURALNETWORKS_NO_ERROR; 73 } 74 75 typedef std::function<void(uint32_t)> OperationReadyCallback; 76 77 // This class tracks whether we know the value of an operand as operations 78 // are processed. 79 class OperandTracker { 80 public: 81 // Creates the tracker for this model. Figure out which operations can be 82 // executed right away and cb for each one of them. 83 OperandTracker(const ModelBuilder* model, OperationReadyCallback cb); 84 // Mark the specified operation as having been processed. The output 85 // of the operation now being known, this may make new operations to be 86 // able to run. Call cb for each one of them. 
87 void markProcessed(uint32_t operationIndex, OperationReadyCallback cb); 88 89 private: 90 const ModelBuilder* mModel; 91 std::multimap<uint32_t, uint32_t> mOperandToOperations; 92 std::vector<uint32_t> mUnknownInputCount; // For each operation 93 }; 94 95 OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) : 96 mModel(model) { 97 const auto& operations = mModel->getOperations(); 98 mUnknownInputCount.resize(operations.size()); 99 for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) { 100 const Operation& operation = operations[operationIndex]; 101 uint32_t count = 0; 102 for (uint32_t operandIndex : operation.inputs) { 103 auto lifetime = mModel->getOperand(operandIndex).lifetime; 104 if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE || 105 lifetime == OperandLifeTime::MODEL_OUTPUT) { 106 count++; 107 mOperandToOperations.insert( 108 std::pair<uint32_t, uint32_t>(operandIndex, operationIndex)); 109 } 110 } 111 if (count == 0) { 112 cb(operationIndex); 113 } 114 mUnknownInputCount[operationIndex] = count; 115 } 116 } 117 118 void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) { 119 // Mark all its outputs as known. 120 const Operation& operation = mModel->getOperations()[operationIndex]; 121 for (uint32_t operandIndex : operation.outputs) { 122 auto range = mOperandToOperations.equal_range(operandIndex); 123 for (auto i = range.first; i != range.second; i++) { 124 uint32_t& count = mUnknownInputCount[i->second]; 125 if (--count == 0) { 126 cb(i->second); 127 } 128 } 129 } 130 } 131 132 ExecutionStep::ExecutionStep(ExecutionPlan* plan, 133 uint32_t stepIndex, 134 std::shared_ptr<ModelBuilder> model, 135 std::shared_ptr<Device> device) 136 : mPlan(plan), mIndex(stepIndex), mSubModel(model), mDevice(device) {} 137 138 // Adds an operand if it has not been added already. 139 // Sets the index in the submodel for the corresponding operand. 
int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
                              const ModelBuilder& fromModel, OperandKind kind) {
    // |kind| says whether the caller encountered this operand as an operation
    // input or output; it determines how a first-seen TEMPORARY is classified.
    // Have we added this operand already?
    auto i = mOperandMap.find(fromOperandIndex);
    if (i != mOperandMap.end()) {
        // Outputs are expected to be seen at most once (topological order),
        // so a repeat sighting must be as an input.
        nnAssert(kind == INPUT);
        *toOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *toOperandIndex = mSubModel->operandCount();
    mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex));

    // Add the operand to the submodel.
    const Operand& operand = fromModel.getOperand(fromOperandIndex);
    ANeuralNetworksOperandType type = {.type = static_cast<int32_t>(operand.type),
                                       .dimensionCount =
                                               static_cast<uint32_t>(operand.dimensions.size()),
                                       .dimensions = operand.dimensions.data(),
                                       .scale = operand.scale,
                                       .zeroPoint = operand.zeroPoint};
    int n = mSubModel->addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
            n = mSubModel->setOperandValue(*toOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
            n = mSubModel->setOperandValueFromMemory(*toOperandIndex, memory,
                                                     operand.location.offset,
                                                     operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            n = mSubModel->setOperandValue(*toOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE:
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
            }
            break;
        case OperandLifeTime::MODEL_INPUT:
            mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            break;
        case OperandLifeTime::MODEL_OUTPUT:
            mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            break;
        default:
            nnAssert(false);
            break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}

// Copies one operation from the original model into this step's submodel,
// translating each of its operand indexes into submodel indexes (adding the
// operands as needed).  Returns an ANEURALNETWORKS_* status code.
int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
    const Operation& operation = fromModel.getOperation(operationIndex);

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order.  Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);

    auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands,
                                          std::vector<uint32_t>& localOperands,
                                          OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            uint32_t localOperand = ~0U;
            int n = addOperand(globalOperands[i], &localOperand, fromModel, kind);
            if (n != ANEURALNETWORKS_NO_ERROR)
                return n;
            localOperands[i] = localOperand;
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    int n;
    if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR ||
        (n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) {
        return n;
    }

    return mSubModel->addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                   outputCount, outputs.data());
}

// Tells the executor how each submodel input/output maps back to an
// input/output of the original model (computed in finishSubModel()).
void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
    for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) {
        stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i);
    }
    for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) {
        stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i);
    }
}

// For every step input that is a cross-partition temporary, marks the operand
// as an output of the step that defines it (recorded via recordTemporaryDef).
void ExecutionPlan::CompoundBody::findSubModelOutputs() {
    for (const auto& step : mSteps) {
        for (const auto& input : step->getSubModelInputs()) {
            const uint32_t fromModelIndex = input.first;
            const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
            nnAssert(it != mTemporaryToDefiningStep.end());
            const uint32_t stepIndex = it->second;
            nnAssert(stepIndex < mSteps.size());
            mSteps[stepIndex]->recordSubModelOutput(fromModelIndex);
        }
    }
}

// Finalizes this step's submodel: computes its input/output lists (original
// model I/O first, then cross-partition temporaries), finishes the submodel,
// and compiles it on the step's device (unless the device is the CPU).
// Sets *hasOutputOfUnknownSize if any cross-partition output has a dimension
// of 0 (unknown), which the plan treats as a failure after all steps finish.
int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize) {
    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;

    auto convertModelInputsOrOutputs = [](
            // IN: mModel{Inputs|Outputs}
            const RemapVectorType& myModelInputsOrOutputs,
            // IN: fromModel->{input|output}Count()
            uint32_t fromModelInputOrOutputCount,
            // IN: fromModel->get{Input|Output}OperandIndex
            std::function<uint32_t(uint32_t)> fromModelGetInputOrOutputOperandIndex,
            // OUT: for v : mModel{Inputs|Outputs} : v.second
            std::vector<uint32_t>* inputsOrOutputs,
            // OUT: submodel input-or-output index to original model input-or-output index
            std::vector<uint32_t>* inputOrOutputIndexSubModelToFromModel) {
        std::map<uint32_t, uint32_t> fromModelIndexMap;  // operand index to input-or-output index
        for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) {
            fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i;
        }
        for (const auto& myInputOrOutput : myModelInputsOrOutputs) {
            inputsOrOutputs->push_back(myInputOrOutput.second);
            const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first];
            inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex);
        }
    };

    std::vector<uint32_t> inputs;
    convertModelInputsOrOutputs(mModelInputs,
                                fromModel->inputCount(),
                                [=](uint32_t i) { return fromModel->getInputOperandIndex(i); },
                                &inputs,
                                &mInputIndexSubModelToFromModel);
    // Cross-partition temporaries consumed by this step come after the
    // original model inputs; ExecutionPlan::next() relies on this ordering.
    for (const auto& subModelInput : mSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }

    std::vector<uint32_t> outputs;
    convertModelInputsOrOutputs(mModelOutputs,
                                fromModel->outputCount(),
                                [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); },
                                &outputs,
                                &mOutputIndexSubModelToFromModel);
    for (const auto& subModelOutput : mSubModelOutputs) {
        outputs.push_back(subModelOutput.second);
        const Operand& operand = mSubModel->getOperand(subModelOutput.second);
        // A 0 dimension means the size is unknown; we cannot lay out the
        // inter-step temporary memory for such an operand.
        for (uint32_t dimension : operand.dimensions) {
            if (dimension == 0) {
                *hasOutputOfUnknownSize = true;
                VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
                                  << " of original graph) has unknown size: "
                                  << toString(operand);
                break;
            }
        }
    }

    {
        int n = mSubModel->identifyInputsAndOutputs(inputs.size(), &inputs[0], outputs.size(), &outputs[0]);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
        n = mSubModel->finish();
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
    }

    // TODO: Move compilation elsewhere?

    if (mDevice == nullptr) {
        // CPU execution needs no ahead-of-time compilation.
        return ANEURALNETWORKS_NO_ERROR;
    }

    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation";
    return compile(mDevice, mSubModel.get(), &mPreparedSubModel);
}

// Logs this step's submodel (verbose compilation logging only).
void ExecutionStep::dump() const {
    Model model;
    mSubModel->setHidlModel(&model);
    if (VLOG_IS_ON(COMPILATION)) {
        VLOG(COMPILATION) << "ExecutionStep#" << mIndex
                          << " for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
        logModelToInfo(model);
    }
}

// Finalizes a multi-step plan: resolves cross-partition outputs, then
// finishes (and compiles) every step.  Fails if any cross-partition output
// has unknown size, since the temporary memory layout needs exact sizes.
int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel) {
    findSubModelOutputs();
    for (const auto& step : mSteps) {
        int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed";
            return n;
        }
    }
    if (mHasSubModelOutputOfUnknownSize) {
        VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize";
        return ANEURALNETWORKS_OP_FAILED;
    }

    mSuccessfulFinish = true;
    return ANEURALNETWORKS_NO_ERROR;
}

// Finalizes a single-step plan: compiles the whole model on the chosen
// device, or does nothing when the device is the CPU.
int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel) {
    if (mDevice == nullptr) {
        mSuccessfulFinish = true;
        return ANEURALNETWORKS_NO_ERROR;
    }

    VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    const int n = compile(mDevice, mModel, &mPreparedModel);
    mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    return n;
}

int ExecutionPlan::finish(const ModelBuilder* fromModel) {
    nnAssert(mBody != nullptr);
    return mBody->finish(fromModel);
}

// Per-execution cursor over the plan's steps.  Allocates one shared Memory
// region sized to hold all cross-partition temporaries; a failed allocation
// poisons the controller (mNextStepIndex = kBadStepIndex).
ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan,
        const ExecutionBuilder* executionBuilder,
        std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
        uint32_t totalSizeOfTemporaries) :
        mPlan(plan), mExecutionBuilder(executionBuilder),
        mSubModelInputsAndOutputs(subModelInputsAndOutputs), mNextStepIndex(0) {
    if (totalSizeOfTemporaries) {
        if (mTemporaries.create(totalSizeOfTemporaries) != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
            mNextStepIndex = kBadStepIndex;
        }
    }
}

// Builds a Controller for one execution of this plan, laying out the offsets
// of all cross-partition temporaries within a single Memory object.
// Returns null if the plan did not finish successfully.
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        const ExecutionBuilder* executionBuilder) const {
    nnAssert((mState == EMPTY) == (mBody == nullptr));
    if (mBody && !mBody->mSuccessfulFinish) {
        VLOG(EXECUTION) << "ExecutionPlan::makeController -- unsuccessful finish";
        return std::shared_ptr<Controller>(nullptr);
    }

    // Create the layout for a Memory object big enough to hold
    // every TEMPORARY in the original model that is live across
    // partition boundaries.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
    if (mState == COMPOUND) {
        const ModelBuilder* fromModel = executionBuilder->getModel();
        for (const auto& step : compound()->mSteps) {
            for (const auto& output: step->getSubModelOutputs()) {
                const uint32_t fromModelOperandIndex = output.first;
                const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
                if (subModelInputsAndOutputs == nullptr) {
                    subModelInputsAndOutputs =
                            std::make_shared<Controller::SubModelInputsAndOutputsType>();
                }
                const uint32_t size = sizeOfData(fromModelOperand);
                // Pad up to the alignment the operand requires, then record
                // the operand's offset within the shared temporary Memory.
                totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
                subModelInputsAndOutputs->insert(std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
                totalSizeOfTemporaries += size;
            }
        }
    }

    return std::shared_ptr<Controller>(new Controller(this, executionBuilder,
                                                      subModelInputsAndOutputs,
                                                      totalSizeOfTemporaries));
}


// TODO: Find a better way to provide this functionality.
// Re-issues the most recent step (e.g. so the caller can retry it on the
// CPU after a driver failure).  Only valid after at least one next() call
// that produced an executor.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == 0) {
        // We haven't called next().
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        // The last call to next() did not produce an executor.
        return ANEURALNETWORKS_OP_FAILED;
    }

    // Rewind one step and hand it out again.
    --controller->mNextStepIndex;
    return next(controller, executor);
}

// Produces the executor for the next step of the plan, or a null *executor
// when all steps have been consumed.  For compound plans, also wires the
// step's cross-partition inputs/outputs to their offsets in the controller's
// shared temporary Memory.
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::next(" << controller << ", " << executor
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mState == EMPTY) {
        nnAssert(controller->mNextStepIndex == 0);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = static_cast<const SimpleBody*>(mBody);
            *executor = std::make_shared<StepExecutor>(
                controller->mExecutionBuilder,
                simpleBody->mModel,
                (simpleBody->mDevice == nullptr ? sp<IDevice>() : simpleBody->mDevice->getInterface()),
                simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        nnAssert(controller->mNextStepIndex == 1);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    auto compoundBody = compound();

    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    const auto step = compoundBody->mSteps[controller->mNextStepIndex];
    *executor = std::make_shared<StepExecutor>(
        controller->mExecutionBuilder,
        step->getSubModel().get(),
        (step->getDevice() == nullptr ? sp<IDevice>() : step->getDevice()->getInterface()),
        step->getPreparedSubModel());
    step->mapInputsAndOutputs(*executor);
    if (controller->mSubModelInputsAndOutputs != nullptr) {
        {
            // Tell executor about submodel outputs.
            // Submodel outputs follow the model outputs in the submodel's
            // output list (see ExecutionStep::finishSubModel).

            const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
            const auto& subModelOutputs = step->getSubModelOutputs();

            uint32_t idx = 0;
            for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                        controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setOutputFromTemporaryMemory(
                    firstSubModelOutputIndex + idx,
                    &controller->mTemporaries,
                    offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
        {
            // Tell executor about submodel inputs.
            // Submodel inputs follow the model inputs in the submodel's
            // input list (see ExecutionStep::finishSubModel).

            const size_t firstSubModelInputIndex = step->getModelInputs().size();
            const auto& subModelInputs = step->getSubModelInputs();

            uint32_t idx = 0;
            for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                        controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setInputFromTemporaryMemory(
                    firstSubModelInputIndex + idx,
                    &controller->mTemporaries,
                    offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
    }
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}

// Appends a new (empty) step targeting |device| to the plan, converting an
// EMPTY plan into a COMPOUND one.  Must not be called on a SIMPLE plan.
std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
    nnAssert(mState != SIMPLE);
    if (mState == EMPTY) {
        mBody = new CompoundBody();
        mState = COMPOUND;
    }
    auto& steps = compound()->mSteps;
    auto step = std::make_shared<ExecutionStep>(
        this, steps.size(), std::make_shared<ModelBuilder>(), device);
    steps.push_back(step);
    return step;
}

// Turns an EMPTY plan into a SIMPLE one: the entire model runs on |device|
// (nullptr meaning the CPU).
void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    nnAssert(mState == EMPTY);
    mBody = new SimpleBody(device, model);
    mState = SIMPLE;
}

void ExecutionPlan::dump() const {
    if (mBody) {
        mBody->dump();
    } else {
        VLOG(COMPILATION) << "EMPTY";
    }
}

// Test-only introspection: reports the plan's kind, folding an unsuccessful
// finish into Kind::ERROR.
ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    switch (mState) {
        case EMPTY:
            return Kind::EMPTY;
        case SIMPLE:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
        case COMPOUND:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
        default:
            nnAssert(!"unexpected state");
            return Kind::ERROR;
    }
}

std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    nnAssert(mState == SIMPLE);
    return static_cast<const SimpleBody*>(mBody)->mDevice;
}

const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}

void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
}

void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}

// Heuristically partitions this model across |devices| (plus the CPU) and
// finalizes |plan|.  Falls back to a single-step plan when only the CPU is
// available, the model is empty, or one device wins every operation.
int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                   uint32_t preference, ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.

    const size_t nonCpuDeviceCount = devices.size();
    // The device count is the number of HAL devices + 1. The +1 is for the CPU.
    // Note that deviceCount includes CPU, which has no entry in devices[].
    const size_t deviceCount = nonCpuDeviceCount + 1;
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
                      << ", operationCount = " << operationCount;

    // If we only have the CPU, or if the graph has no operations, no
    // need to try to partition.
    if (deviceCount == 1 || operationCount == 0) {
        plan->becomeSingleStep(nullptr /* CPU */, this);
        return plan->finish(this);
    }

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector, with devices.size()
    // representing the CPU.
    std::vector<int> bestDeviceForOperation(operationCount);
    findBestDeviceForEachOperation(preference, devices, operationCount, deviceCount,
                                   &bestDeviceForOperation);

    // If one device will run all the operations, we don't need to split the work.
    // NOTE(review): std::adjacent_find/std::not_equal_to need <algorithm> and
    // <functional>; <algorithm> is not included here, so this relies on a
    // transitive include -- confirm and add it explicitly.
    if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        const bool cpu = (size_t(bestDeviceIndex) == deviceCount - 1);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                          << bestDeviceIndex << " = "
                          << (cpu ? "CPU" : devices[bestDeviceIndex]->getName());
        plan->becomeSingleStep(cpu ? nullptr : devices[bestDeviceIndex], this);
        return plan->finish(this);
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex;
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the CPU. We do this to try to maximize the
    // size of the graph we'll send to non-CPU devices. If the CPU runs first,
    // it will have the chance to prepare more of the inputs required by the
    // other devices. This function returns -1 if all queues are empty.
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = deviceCount - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create an execution step.
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
        if (deviceIndex < 0) {
            break;
        }
        // nullptr represents the CPU.
        std::shared_ptr<Device> device =
                static_cast<size_t>(deviceIndex) < nonCpuDeviceCount
                        ? devices[deviceIndex] : nullptr;

        // Assign as much as possible to this device.
        std::shared_ptr<ExecutionStep> step = plan->createNewStep(device);
        auto& queue = perDeviceQueue[deviceIndex];
        while (!queue.empty()) {
            uint32_t operationIndex = queue.front();
            queue.pop();
            // NOTE(review): the return value of addOperation is ignored here;
            // a failure would only surface later via plan->finish() -- confirm
            // whether an early bail-out is wanted.
            step->addOperation(operationIndex, *this);
            tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
        }
    }

    int n = plan->finish(this);
    if (VLOG_IS_ON(COMPILATION)) {
        Model model;
        setHidlModel(&model);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
        logModelToInfo(model);
        plan->dump();
    }
    return n;
}

// Returns the performance bucket (float32 vs quantized8) that |device|
// advertises for the given operation, keyed off the operation's first input.
PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
                                                 uint32_t operationIndex) const {
    const Operation& operation = getOperation(operationIndex);
    // TODO This assumes that the type is dictated by the first operand. This is
    // currently the case but is not a safe assumption to make in the long term.
    const uint32_t operandIndex = operation.inputs[0];
    const OperandType operandType = mOperands[operandIndex].type;
    switch(operandType) {
        case OperandType::FLOAT32:
        case OperandType::TENSOR_FLOAT32:
            return device->getFloat32Performance();
        case OperandType::INT32:
        case OperandType::UINT32:
        case OperandType::TENSOR_INT32:
        case OperandType::TENSOR_QUANT8_ASYMM:
            // For OEM, the real selection will be made from who can run the operand.
        case OperandType::OEM:
        case OperandType::TENSOR_OEM_BYTE:
            return device->getQuantized8Performance();
        default:
            nnAssert(false);
            return device->getQuantized8Performance();
    }
}

namespace {
// This class determines whether a given device can execute a given operation
class CanDo {
public:
    CanDo() {}

    void initialize(const ModelBuilder* model, std::shared_ptr<Device> device) {
        Model hidlModel;
        model->setHidlModel(&hidlModel);
        device->getSupportedOperations(hidlModel, &mSupportsOperationByIndex);
    }

    bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

private:
    hidl_vec<bool> mSupportsOperationByIndex;
};
};  // anonymous namespace

// For each operation, picks the capable device with the best performance
// under |preference| (power vs. execution time); falls back to the CPU
// (index nonCpuDeviceCount) when no driver supports the operation.
int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference,
        const std::vector<std::shared_ptr<Device>>& devices,
        const size_t operationCount, [[maybe_unused]] const size_t deviceCount,
        std::vector<int>* bestDeviceForOperation) const {

    // Note that deviceCount includes CPU, which has no entry in devices[]
    const size_t nonCpuDeviceCount = deviceCount - 1;

    std::vector<CanDo> canDo(nonCpuDeviceCount);
    for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(this, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    //
    // TODO: If the best driver is inferior (higher-power or
    // longer-running, depending on preference) than the CPU, then we
    // should use the CPU.  We could do this by setting bestChoice
    // initially to the number representing the CPU
    // (nonCpuDeviceCount) and bestPerfVal to the CPU value.  Problem
    // is, we have no such number now, so that will have to be for
    // release P or later.  One option is that the float performance
    // is a ratio of device/cpu rather than a number in joules or
    // microseconds.
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        int bestChoice = -1;
        float bestPerfVal = 0.0;  // do not check bestPerfVal unless we have bestChoice >= 0
        for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
            if (canDo[deviceIndex].check(operationIndex)) {
                const auto& device = devices[deviceIndex];
                const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
                const float perfVal =
                        (preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
                                                                        : perf.execTime);
                // Lower is better for both power and execution time.
                if ((bestChoice >= 0) && (bestPerfVal <= perfVal)) {
                    continue;
                }
                bestChoice = deviceIndex;
                bestPerfVal = perfVal;
            }
        }
        // No drivers are available for this operation, so choose the CPU.
        // TODO What if it is an OEM op?
        (*bestDeviceForOperation)[operationIndex] =
                bestChoice >= 0 ? bestChoice : static_cast<int>(nonCpuDeviceCount);
        VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                          << toString(getOperation(operationIndex).type)
                          << ") = "
                          << (*bestDeviceForOperation)[operationIndex];
    }
    return ANEURALNETWORKS_NO_ERROR;
}

}  // namespace nn
}  // namespace android