Home | History | Annotate | Download | only in runtime
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #define LOG_TAG "ExecutionPlan"
     18 
     19 #include "ExecutionPlan.h"
     20 
     21 #include "Callbacks.h"
     22 #include "CompilationBuilder.h"
     23 #include "ExecutionBuilder.h"
     24 #include "Manager.h"
     25 #include "ModelBuilder.h"
     26 #include "Utils.h"
     27 
     28 #include <functional>
     29 #include <map>
     30 #include <queue>
     31 #include <unordered_set>
     32 #include <utility>
     33 #include <vector>
     34 
     35 using ::android::hardware::neuralnetworks::V1_0::implementation::ExecutionCallback;
     36 using ::android::hardware::neuralnetworks::V1_0::implementation::PreparedModelCallback;
     37 
     38 namespace android {
     39 namespace nn {
     40 
     41 static int compile(std::shared_ptr<Device> device,
     42                    const ModelBuilder* model,
     43                    sp<IPreparedModel>* preparedModel) {
     44     nnAssert(device != nullptr);  // nullptr indicates CPU
     45     // Compilation logic copied from ExecutionBuilder::startComputeOnDevice().
     46     Model hidlModel;
     47     model->setHidlModel(&hidlModel);
     48 
     49     sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();
     50     Return<ErrorStatus> prepareLaunchStatus =
     51             device->getInterface()->prepareModel(hidlModel, preparedModelCallback);
     52     if (!prepareLaunchStatus.isOk()) {
     53         LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed due to transport error: "
     54                    << prepareLaunchStatus.description();
     55         return ANEURALNETWORKS_OP_FAILED;
     56     }
     57     if (prepareLaunchStatus != ErrorStatus::NONE) {
     58         LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed with error: "
     59                    << toString(static_cast<ErrorStatus>(prepareLaunchStatus));
     60         return ANEURALNETWORKS_OP_FAILED;
     61     }
     62 
     63     preparedModelCallback->wait();
     64     ErrorStatus prepareReturnStatus = preparedModelCallback->getStatus();
     65     *preparedModel = preparedModelCallback->getPreparedModel();
     66     if (prepareReturnStatus != ErrorStatus::NONE || preparedModel == nullptr) {
     67         LOG(ERROR) << "ExecutionPlan compilation on " << device->getName() << " failed:"
     68                    << " prepareReturnStatus=" << toString(prepareReturnStatus)
     69                    << ", preparedModel=" << preparedModel->get();
     70         return ANEURALNETWORKS_OP_FAILED;
     71     }
     72     return ANEURALNETWORKS_NO_ERROR;
     73 }
     74 
// Signature of the callback invoked when an operation becomes ready to run;
// the argument is the operation's index within the model.
typedef std::function<void(uint32_t)> OperationReadyCallback;

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
public:
    // Creates the tracker for this model. Figure out which operations can be
    // executed right away and cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Mark the specified operation as having been processed. The output
    // of the operation now being known, this may make new operations to be
    // able to run.  Call cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

private:
    const ModelBuilder* mModel;
    // Maps an operand index to the indexes of the operations that consume it.
    // Only populated for operands whose values are produced at execution time
    // (TEMPORARY_VARIABLE and MODEL_OUTPUT lifetimes -- see the constructor).
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};
     94 
     95 OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) :
     96         mModel(model) {
     97     const auto& operations = mModel->getOperations();
     98     mUnknownInputCount.resize(operations.size());
     99     for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
    100         const Operation& operation = operations[operationIndex];
    101         uint32_t count = 0;
    102         for (uint32_t operandIndex : operation.inputs) {
    103             auto lifetime = mModel->getOperand(operandIndex).lifetime;
    104             if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
    105                 lifetime == OperandLifeTime::MODEL_OUTPUT) {
    106                 count++;
    107                 mOperandToOperations.insert(
    108                         std::pair<uint32_t, uint32_t>(operandIndex, operationIndex));
    109             }
    110         }
    111         if (count == 0) {
    112             cb(operationIndex);
    113         }
    114         mUnknownInputCount[operationIndex] = count;
    115     }
    116 }
    117 
    118 void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    119     // Mark all its outputs as known.
    120     const Operation& operation = mModel->getOperations()[operationIndex];
    121     for (uint32_t operandIndex : operation.outputs) {
    122         auto range = mOperandToOperations.equal_range(operandIndex);
    123         for (auto i = range.first; i != range.second; i++) {
    124             uint32_t& count = mUnknownInputCount[i->second];
    125             if (--count == 0) {
    126                 cb(i->second);
    127             }
    128         }
    129     }
    130 }
    131 
// One partition of the original model: the slice of the graph (held in
// 'model') assigned to 'device' (nullptr meaning the CPU), at position
// 'stepIndex' within the owning plan.
ExecutionStep::ExecutionStep(ExecutionPlan* plan,
                             uint32_t stepIndex,
                             std::shared_ptr<ModelBuilder> model,
                             std::shared_ptr<Device> device)
        : mPlan(plan), mIndex(stepIndex), mSubModel(model), mDevice(device) {}
    137 
// Adds an operand if it has not been added already.
// Sets the index in the submodel for the corresponding operand.
//
// 'kind' records whether the operand is first encountered as an INPUT or an
// OUTPUT of an operation in this step; for TEMPORARY_VARIABLE operands this
// determines whether it crosses a partition boundary (see below).
// Returns ANEURALNETWORKS_NO_ERROR on success, or propagates the submodel
// builder's error code.
int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
                              const ModelBuilder& fromModel, OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(fromOperandIndex);
    if (i != mOperandMap.end()) {
        // An operand is defined (OUTPUT) at most once within a step, so a
        // repeat visit must be as a consumer.
        nnAssert(kind == INPUT);
        *toOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *toOperandIndex = mSubModel->operandCount();
    mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex));

    // Add the operand to the submodel.
    const Operand& operand = fromModel.getOperand(fromOperandIndex);
    ANeuralNetworksOperandType type = {.type = static_cast<int32_t>(operand.type),
                                       .dimensionCount =
                                               static_cast<uint32_t>(operand.dimensions.size()),
                                       .dimensions = operand.dimensions.data(),
                                       .scale = operand.scale,
                                       .zeroPoint = operand.zeroPoint};
    int n = mSubModel->addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            // Constant stored inline in the original model; copy it into the
            // submodel.
            const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
            n = mSubModel->setOperandValue(*toOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            // Constant stored in a shared memory pool; reference the same
            // region from the submodel.
            const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
            n = mSubModel->setOperandValueFromMemory(*toOperandIndex, memory,
                                                     operand.location.offset,
                                                     operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            // Omitted optional operand.
            n = mSubModel->setOperandValue(*toOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE:
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
            }
            break;
        case OperandLifeTime::MODEL_INPUT:
            mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            break;
        case OperandLifeTime::MODEL_OUTPUT:
            mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            break;
        default:
            nnAssert(false);
            break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}
    221 
    222 int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
    223     const Operation& operation = fromModel.getOperation(operationIndex);
    224 
    225     // Convert the input and output operand indexes.
    226     //
    227     // We expect operations to be added in topological order.  Therefore:
    228     //
    229     // - We may not have seen an input if it is a model input, a
    230     //   constant, or an operand written by a different partition.
    231     //
    232     // - We should not have seen any outputs.
    233     const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    234     const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    235     std::vector<uint32_t> inputs(inputCount);
    236     std::vector<uint32_t> outputs(outputCount);
    237 
    238     auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands,
    239                                           std::vector<uint32_t>& localOperands,
    240                                           OperandKind kind) -> int {
    241         const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
    242         for (uint32_t i = 0; i < operandCount; i++) {
    243             uint32_t localOperand = ~0U;
    244             int n = addOperand(globalOperands[i], &localOperand, fromModel, kind);
    245             if (n != ANEURALNETWORKS_NO_ERROR)
    246                 return n;
    247             localOperands[i] = localOperand;
    248         }
    249         return ANEURALNETWORKS_NO_ERROR;
    250     };
    251 
    252     int n;
    253     if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR ||
    254         (n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) {
    255         return n;
    256     }
    257 
    258     return mSubModel->addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
    259                                    outputCount, outputs.data());
    260 }
    261 
    262 void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
    263     for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) {
    264         stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i);
    265     }
    266     for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) {
    267         stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i);
    268     }
    269 }
    270 
    271 void ExecutionPlan::CompoundBody::findSubModelOutputs() {
    272     for (const auto& step : mSteps) {
    273         for (const auto& input : step->getSubModelInputs()) {
    274             const uint32_t fromModelIndex = input.first;
    275             const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
    276             nnAssert(it != mTemporaryToDefiningStep.end());
    277             const uint32_t stepIndex = it->second;
    278             nnAssert(stepIndex < mSteps.size());
    279             mSteps[stepIndex]->recordSubModelOutput(fromModelIndex);
    280         }
    281     }
    282 }
    283 
// Finalizes this step's submodel: declares its inputs/outputs, finishes the
// model, and (for a non-CPU device) compiles it.  Sets *hasOutputOfUnknownSize
// if any cross-partition output has a dimension of 0 (size unknown at
// compile time).  Returns ANEURALNETWORKS_NO_ERROR on success.
int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize) {
    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;

    // Converts a list of (fromModel operand index, submodel operand index)
    // pairs into (a) the submodel's input-or-output operand list and (b) a
    // table mapping submodel input-or-output position to original-model
    // input-or-output position, used later to route the caller's buffers.
    auto convertModelInputsOrOutputs = [](
            // IN: mModel{Inputs|Outputs}
            const RemapVectorType& myModelInputsOrOutputs,
            // IN: fromModel->{input|output}Count()
            uint32_t fromModelInputOrOutputCount,
            // IN: fromModel->get{Input|Output}OperandIndex
            std::function<uint32_t(uint32_t)> fromModelGetInputOrOutputOperandIndex,
            // OUT: for v : mModel{Inputs|Outputs} : v.second
            std::vector<uint32_t>* inputsOrOutputs,
            // OUT: submodel input-or-output index to original model input-or-output index
            std::vector<uint32_t>* inputOrOutputIndexSubModelToFromModel) {
        std::map<uint32_t, uint32_t> fromModelIndexMap;  // operand index to input-or-output index
        for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) {
            fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i;
        }
        for (const auto& myInputOrOutput : myModelInputsOrOutputs) {
            inputsOrOutputs->push_back(myInputOrOutput.second);
            const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first];
            inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex);
        }
    };

    // Submodel inputs: original-model inputs first, then operands produced
    // by other partitions.
    std::vector<uint32_t> inputs;
    convertModelInputsOrOutputs(mModelInputs,
                                fromModel->inputCount(),
                                [=](uint32_t i) { return fromModel->getInputOperandIndex(i); },
                                &inputs,
                                &mInputIndexSubModelToFromModel);
    for (const auto& subModelInput : mSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }

    // Submodel outputs: original-model outputs first, then operands consumed
    // by other partitions.
    std::vector<uint32_t> outputs;
    convertModelInputsOrOutputs(mModelOutputs,
                                fromModel->outputCount(),
                                [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); },
                                &outputs,
                                &mOutputIndexSubModelToFromModel);
    for (const auto& subModelOutput : mSubModelOutputs) {
        outputs.push_back(subModelOutput.second);
        // A dimension of 0 means the operand's size is unknown at compile
        // time; flag it so the caller can reject the partitioning.
        const Operand& operand = mSubModel->getOperand(subModelOutput.second);
        for (uint32_t dimension : operand.dimensions) {
            if (dimension == 0) {
                *hasOutputOfUnknownSize = true;
                VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
                                << " of original graph) has unknown size: "
                                << toString(operand);
                break;
            }
        }
    }

    {
      int n = mSubModel->identifyInputsAndOutputs(inputs.size(), &inputs[0], outputs.size(), &outputs[0]);
      if (n != ANEURALNETWORKS_NO_ERROR) {
          return n;
      }
      n = mSubModel->finish();
      if (n != ANEURALNETWORKS_NO_ERROR) {
          return n;
      }
    }

    // TODO: Move compilation elsewhere?

    // A nullptr device means this step runs on the CPU, which needs no
    // ahead-of-time compilation.
    if (mDevice == nullptr) {
        return ANEURALNETWORKS_NO_ERROR;
    }

    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation";
    return compile(mDevice, mSubModel.get(), &mPreparedSubModel);
}
    359 
    360 void ExecutionStep::dump() const {
    361     Model model;
    362     mSubModel->setHidlModel(&model);
    363     if (VLOG_IS_ON(COMPILATION)) {
    364         VLOG(COMPILATION) << "ExecutionStep#" << mIndex
    365                           << " for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
    366         logModelToInfo(model);
    367     }
    368 }
    369 
    370 int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel) {
    371     findSubModelOutputs();
    372     for (const auto& step : mSteps) {
    373         int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize);
    374         if (n != ANEURALNETWORKS_NO_ERROR) {
    375             VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed";
    376             return n;
    377         }
    378     }
    379     if (mHasSubModelOutputOfUnknownSize) {
    380         VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize";
    381         return ANEURALNETWORKS_OP_FAILED;
    382     }
    383 
    384     mSuccessfulFinish = true;
    385     return ANEURALNETWORKS_NO_ERROR;
    386 }
    387 
    388 int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel) {
    389     if (mDevice == nullptr) {
    390         mSuccessfulFinish = true;
    391         return ANEURALNETWORKS_NO_ERROR;
    392     }
    393 
    394     VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    395     const int n = compile(mDevice, mModel, &mPreparedModel);
    396     mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    397     return n;
    398 }
    399 
// Finishes the plan by delegating to the SIMPLE or COMPOUND body; the plan
// must be non-empty (mBody set by becomeSingleStep() or createNewStep()).
int ExecutionPlan::finish(const ModelBuilder* fromModel) {
    nnAssert(mBody != nullptr);
    return mBody->finish(fromModel);
}
    404 
// Tracks the progress of one execution of a plan.  Allocates a single shared
// Memory region of 'totalSizeOfTemporaries' bytes to back every temporary
// operand that crosses a partition boundary (offsets are recorded in
// 'subModelInputsAndOutputs', computed by makeController()).
ExecutionPlan::Controller::Controller(
    const ExecutionPlan* plan,
    const ExecutionBuilder* executionBuilder,
    std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
    uint32_t totalSizeOfTemporaries) :
        mPlan(plan), mExecutionBuilder(executionBuilder),
        mSubModelInputsAndOutputs(subModelInputsAndOutputs), mNextStepIndex(0) {
    if (totalSizeOfTemporaries) {
        if (mTemporaries.create(totalSizeOfTemporaries) != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
            // Poison the controller so that next() fails immediately.
            mNextStepIndex = kBadStepIndex;
        }
    }
}
    419 
// Creates a Controller for one execution of this (already finished) plan.
// For a COMPOUND plan, lays out a single Memory region holding every
// temporary operand that crosses a partition boundary, and records each
// operand's byte offset.  Returns nullptr if the plan did not finish
// successfully.
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
    const ExecutionBuilder* executionBuilder) const {
    nnAssert((mState == EMPTY) == (mBody == nullptr));
    if (mBody && !mBody->mSuccessfulFinish) {
        VLOG(EXECUTION) << "ExecutionPlan::makeController -- unsuccessful finish";
        return std::shared_ptr<Controller>(nullptr);
    }

    // Create the layout for a Memory object big enough for to hold
    // every TEMPORARY in the original model that is live across
    // partition boundaries.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
    if (mState == COMPOUND) {
        const ModelBuilder* fromModel = executionBuilder->getModel();
        for (const auto& step : compound()->mSteps) {
            for (const auto& output: step->getSubModelOutputs()) {
                const uint32_t fromModelOperandIndex = output.first;
                const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
                // Lazily create the map only when there is at least one
                // cross-partition temporary.
                if (subModelInputsAndOutputs == nullptr) {
                    subModelInputsAndOutputs =
                            std::make_shared<Controller::SubModelInputsAndOutputsType>();
                }
                // Pad to the operand's alignment requirement before recording
                // its offset within the shared region.
                const uint32_t size = sizeOfData(fromModelOperand);
                totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
                subModelInputsAndOutputs->insert(std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
                totalSizeOfTemporaries += size;
            }
        }
    }

    return std::shared_ptr<Controller>(new Controller(this, executionBuilder,
                                                      subModelInputsAndOutputs,
                                                      totalSizeOfTemporaries));
}
    473 
    474 
    475 // TODO: Find a better way to provide this functionality.
    476 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
    477                             std::shared_ptr<StepExecutor>* executor) const {
    478     *executor = nullptr;
    479 
    480     VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
    481                     << "): mNextStepIndex = " << controller->mNextStepIndex;
    482 
    483     if (controller->mNextStepIndex == 0) {
    484         // We haven't called next().
    485         return ANEURALNETWORKS_OP_FAILED;
    486     }
    487 
    488     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
    489         // The last call to next() did not produce an executor.
    490         return ANEURALNETWORKS_OP_FAILED;
    491     }
    492 
    493     --controller->mNextStepIndex;
    494     return next(controller, executor);
    495 }
    496 
// Produces the StepExecutor for the next step of the plan, or a nullptr
// executor (with ANEURALNETWORKS_NO_ERROR) once every step has been handed
// out.  Advances controller->mNextStepIndex; on any failure the index is set
// to kBadStepIndex so subsequent calls fail fast.
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::next(" << controller << ", " << executor
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    // A poisoned controller (allocation failure or earlier error) stays failed.
    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mState == EMPTY) {
        nnAssert(controller->mNextStepIndex == 0);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = static_cast<const SimpleBody*>(mBody);
            *executor = std::make_shared<StepExecutor>(
                controller->mExecutionBuilder,
                simpleBody->mModel,
                (simpleBody->mDevice == nullptr ? sp<IDevice>() : simpleBody->mDevice->getInterface()),
                simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        nnAssert(controller->mNextStepIndex == 1);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // COMPOUND plan: hand out one partition per call.
    auto compoundBody = compound();

    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    const auto step = compoundBody->mSteps[controller->mNextStepIndex];
    *executor = std::make_shared<StepExecutor>(
        controller->mExecutionBuilder,
        step->getSubModel().get(),
        (step->getDevice() == nullptr ? sp<IDevice>() : step->getDevice()->getInterface()),
        step->getPreparedSubModel());
    step->mapInputsAndOutputs(*executor);
    // Wire cross-partition temporaries into the shared Memory region, using
    // the per-operand offsets computed by makeController().  Submodel
    // inputs/outputs follow the model inputs/outputs in the executor's
    // index space, hence the firstSubModel*Index offsets.
    if (controller->mSubModelInputsAndOutputs != nullptr) {
        {
            // Tell executor about submodel outputs.

            const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
            const auto& subModelOutputs = step->getSubModelOutputs();

            uint32_t idx = 0;
            for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                    controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setOutputFromTemporaryMemory(
                    firstSubModelOutputIndex + idx,
                    &controller->mTemporaries,
                    offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
        {
            // Tell executor about submodel inputs.

            const size_t firstSubModelInputIndex = step->getModelInputs().size();
            const auto& subModelInputs = step->getSubModelInputs();

            uint32_t idx = 0;
            for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                    controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setInputFromTemporaryMemory(
                    firstSubModelInputIndex + idx,
                    &controller->mTemporaries,
                    offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
    }
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}
    595 
    596 std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
    597     nnAssert(mState != SIMPLE);
    598     if (mState == EMPTY) {
    599         mBody = new CompoundBody();
    600         mState = COMPOUND;
    601     }
    602     auto& steps = compound()->mSteps;
    603     auto step = std::make_shared<ExecutionStep>(
    604         this, steps.size(), std::make_shared<ModelBuilder>(), device);
    605     steps.push_back(step);
    606     return step;
    607 }
    608 
// Converts an EMPTY plan into a SIMPLE one: the whole model will run as a
// single step on the given device (nullptr == CPU).
void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    nnAssert(mState == EMPTY);
    mBody = new SimpleBody(device, model);
    mState = SIMPLE;
}
    615 
    616 void ExecutionPlan::dump() const {
    617     if (mBody) {
    618         mBody->dump();
    619     } else {
    620         VLOG(COMPILATION) << "EMPTY";
    621     }
    622 }
    623 
    624 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    625     switch (mState) {
    626         case EMPTY:
    627             return Kind::EMPTY;
    628         case SIMPLE:
    629             nnAssert(mBody);
    630             return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
    631         case COMPOUND:
    632             nnAssert(mBody);
    633             return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
    634         default:
    635             nnAssert(!"unexpected state");
    636             return Kind::ERROR;
    637     }
    638 }
    639 
// Test-only accessor: the device of a SIMPLE (single-step) plan.
std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    nnAssert(mState == SIMPLE);
    return static_cast<const SimpleBody*>(mBody)->mDevice;
}
    644 
// Test-only accessor: the steps of a COMPOUND plan (compound() asserts the
// plan is in the COMPOUND state).
const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}
    648 
    649 void ExecutionPlan::SimpleBody::dump() const {
    650     VLOG(COMPILATION) << "SIMPLE for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
    651 }
    652 
// Logs a description of every step of this compound plan, in step order.
void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}
    658 
// Heuristically partitions this model's operations across the available
// devices and records the result in |plan|.
//   |devices|    - the non-CPU (HAL) devices; the CPU has no entry here.
//   |preference| - an ANEURALNETWORKS_PREFER_* execution preference.
// Returns the ANEURALNETWORKS_* status from ExecutionPlan::finish().
int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                   uint32_t preference, ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.

    const size_t nonCpuDeviceCount = devices.size();
    // The device count is the number of HAL devices + 1. The +1 is for the CPU.
    // Note that deviceCount includes CPU, which has no entry in devices[].
    const size_t deviceCount = nonCpuDeviceCount + 1;
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
                      << ", operationCount = " << operationCount;

    // If we only have the CPU, or if the graph has no operations, no
    // need to try to partition.
    if (deviceCount == 1 || operationCount == 0) {
        plan->becomeSingleStep(nullptr /* CPU */, this);
        return plan->finish(this);
    }

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector, with devices.size()
    // representing the CPU.
    std::vector<int> bestDeviceForOperation(operationCount);
    findBestDeviceForEachOperation(preference, devices, operationCount, deviceCount,
                                   &bestDeviceForOperation);

    // If one device will run all the operations, we don't need to split the work.
    if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        // All entries are equal, so the first identifies the sole device.
        const int bestDeviceIndex = bestDeviceForOperation[0];
        const bool cpu = (size_t(bestDeviceIndex) == deviceCount - 1);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                          << bestDeviceIndex << " = "
                          << (cpu ? "CPU" : devices[bestDeviceIndex]->getName());
        plan->becomeSingleStep(cpu ? nullptr : devices[bestDeviceIndex], this);
        return plan->finish(this);
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex;
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the CPU. We do this to try to maximize the
    // size of the graph we'll send to non-CPU devices. If the CPU runs first,
    // it will have the chance to prepare more of the inputs required by the
    // other devices. This function returns -1 if all queues are empty.
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = deviceCount - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    // NOTE(review): the tracker presumably seeds the queues with the
    // initially-ready operations via the callback — confirm against
    // OperandTracker.
    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create an execution step.
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
        if (deviceIndex < 0) {
            break;  // all queues drained; every operation has been assigned
        }
        // nullptr represents the CPU.
        std::shared_ptr<Device> device =
                static_cast<size_t>(deviceIndex) < nonCpuDeviceCount
                        ? devices[deviceIndex] : nullptr;

        // Assign as much as possible to this device.
        std::shared_ptr<ExecutionStep> step = plan->createNewStep(device);
        auto& queue = perDeviceQueue[deviceIndex];
        while (!queue.empty()) {
            uint32_t operationIndex = queue.front();
            queue.pop();
            step->addOperation(operationIndex, *this);
            // markProcessed may enqueue newly-ready operations, possibly
            // growing this same queue so they join the current step.
            tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
        }
    }

    int n = plan->finish(this);
    if (VLOG_IS_ON(COMPILATION)) {
        // At high verbosity, log both the original model and the plan.
        Model model;
        setHidlModel(&model);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
        logModelToInfo(model);
        plan->dump();
    }
    return n;
}
    761 
    762 PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
    763                                                  uint32_t operationIndex) const {
    764     const Operation& operation = getOperation(operationIndex);
    765     // TODO This assumes that the type is dictated by the first operand. This is
    766     // currently the case but is not a safe assumption to make in the long term.
    767     const uint32_t operandIndex = operation.inputs[0];
    768     const OperandType operandType = mOperands[operandIndex].type;
    769     switch(operandType) {
    770         case OperandType::FLOAT32:
    771         case OperandType::TENSOR_FLOAT32:
    772             return device->getFloat32Performance();
    773         case OperandType::INT32:
    774         case OperandType::UINT32:
    775         case OperandType::TENSOR_INT32:
    776         case OperandType::TENSOR_QUANT8_ASYMM:
    777             // For OEM, the real selection will be made from who can run the operand.
    778         case OperandType::OEM:
    779         case OperandType::TENSOR_OEM_BYTE:
    780             return device->getQuantized8Performance();
    781         default:
    782             nnAssert(false);
    783             return device->getQuantized8Performance();
    784     }
    785 }
    786 
    787 namespace {
    788 // This class determines whether a given device can execute a given operation
    789 class CanDo {
    790 public:
    791     CanDo() {}
    792 
    793     void initialize(const ModelBuilder* model, std::shared_ptr<Device> device) {
    794         Model hidlModel;
    795         model->setHidlModel(&hidlModel);
    796         device->getSupportedOperations(hidlModel, &mSupportsOperationByIndex);
    797     }
    798 
    799     bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
    800 
    801 private:
    802     hidl_vec<bool> mSupportsOperationByIndex;
    803 };
    804 };  // anonymous namespace
    805 
    806 int ModelBuilder::findBestDeviceForEachOperation(
    807         uint32_t preference,
    808         const std::vector<std::shared_ptr<Device>>& devices,
    809         const size_t operationCount, [[maybe_unused]] const size_t deviceCount,
    810         std::vector<int>* bestDeviceForOperation) const {
    811 
    812     // Note that deviceCount includes CPU, which has no entry in devices[]
    813     const size_t nonCpuDeviceCount = deviceCount - 1;
    814 
    815     std::vector<CanDo> canDo(nonCpuDeviceCount);
    816     for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
    817         canDo[deviceIndex].initialize(this, devices[deviceIndex]);
    818     }
    819 
    820     // Figure out the best driver for each operation.
    821     //
    822     // TODO: If the best driver is inferior (higher-power or
    823     // longer-running, depending on preference) than the CPU, then we
    824     // should use the CPU.  We could do this by setting bestChoice
    825     // initially to the number representing the CPU
    826     // (nonCpuDeviceCount) and bestPerfVal to the CPU value.  Problem
    827     // is, we have no such number now, so that will have to be for
    828     // release P or later.  One option is that the float performance
    829     // is a ratio of device/cpu rather than a number in joules or
    830     // microseconds.
    831     for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
    832         int bestChoice = -1;
    833         float bestPerfVal = 0.0;  // do not check bestPerfVal unless we have bestChoice >= 0
    834         for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
    835             if (canDo[deviceIndex].check(operationIndex)) {
    836                 const auto& device = devices[deviceIndex];
    837                 const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
    838                 const float perfVal =
    839                             (preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
    840                                                                             : perf.execTime);
    841                 if ((bestChoice >= 0) && (bestPerfVal <= perfVal)) {
    842                     continue;
    843                 }
    844                 bestChoice = deviceIndex;
    845                 bestPerfVal = perfVal;
    846             }
    847         }
    848         // No drivers are available for this operation, so choose the CPU.
    849         // TODO What if it is an OEM op?
    850         (*bestDeviceForOperation)[operationIndex] =
    851                 bestChoice >= 0 ? bestChoice : static_cast<int>(nonCpuDeviceCount);
    852         VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
    853                           << toString(getOperation(operationIndex).type)
    854                           << ") = "
    855                           << (*bestDeviceForOperation)[operationIndex];
    856     }
    857     return ANEURALNETWORKS_NO_ERROR;
    858 }
    859 
    860 } // namespace nn
    861 } // namespace android
    862