// frameworks/ml/nn/runtime/ExecutionPlan.cpp
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #define LOG_TAG "ExecutionPlan"
     18 
     19 #include "ExecutionPlan.h"
     20 
     21 #include "Callbacks.h"
     22 #include "CompilationBuilder.h"
     23 #include "ExecutionBuilder.h"
     24 #include "Manager.h"
     25 #include "ModelBuilder.h"
     26 #include "Utils.h"
     27 
     28 #include <functional>
     29 #include <map>
     30 #include <queue>
     31 #include <unordered_set>
     32 #include <utility>
     33 #include <vector>
     34 
     35 using ::android::hardware::neuralnetworks::V1_0::implementation::ExecutionCallback;
     36 using ::android::hardware::neuralnetworks::V1_0::implementation::PreparedModelCallback;
     37 
     38 namespace android {
     39 namespace nn {
     40 
     41 static int compile(std::shared_ptr<Device> device, const ModelBuilder* model,
     42                    int32_t executionPreference, sp<IPreparedModel>* preparedModel) {
     43     nnAssert(device != nullptr);  // nullptr indicates CPU
     44     // Compilation logic copied from ExecutionBuilder::startComputeOnDevice().
     45     Model hidlModel;
     46     model->setHidlModel(&hidlModel);
     47 
     48     sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();
     49     Return<ErrorStatus> prepareLaunchStatus = device->getInterface()->prepareModel(
     50         hidlModel, static_cast<ExecutionPreference>(executionPreference), preparedModelCallback);
     51     if (!prepareLaunchStatus.isOk()) {
     52         LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed due to transport error: "
     53                    << prepareLaunchStatus.description();
     54         return ANEURALNETWORKS_OP_FAILED;
     55     }
     56     if (prepareLaunchStatus != ErrorStatus::NONE) {
     57         LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed with error: "
     58                    << toString(static_cast<ErrorStatus>(prepareLaunchStatus));
     59         return ANEURALNETWORKS_OP_FAILED;
     60     }
     61 
     62     preparedModelCallback->wait();
     63     ErrorStatus prepareReturnStatus = preparedModelCallback->getStatus();
     64     *preparedModel = preparedModelCallback->getPreparedModel();
     65     if (prepareReturnStatus != ErrorStatus::NONE || *preparedModel == nullptr) {
     66         LOG(ERROR) << "ExecutionPlan compilation on " << device->getName() << " failed:"
     67                    << " prepareReturnStatus=" << toString(prepareReturnStatus)
     68                    << ", preparedModel=" << preparedModel->get();
     69         return ANEURALNETWORKS_OP_FAILED;
     70     }
     71     return ANEURALNETWORKS_NO_ERROR;
     72 }
     73 
// Invoked once per operation when all of that operation's inputs become
// known; the argument is the operation's index in the model.
typedef std::function<void(uint32_t)> OperationReadyCallback;

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
public:
    // Creates the tracker for this model. Figure out which operations can be
    // executed right away and cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Mark the specified operation as having been processed. The output
    // of the operation now being known, this may make new operations to be
    // able to run.  Call cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

private:
    const ModelBuilder* mModel;
    // Maps an operand index to the indices of the operations that consume it
    // (only for operands whose values are initially unknown).
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    // Number of still-unknown inputs, indexed by operation.
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};
     93 
     94 OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) :
     95         mModel(model) {
     96     const auto& operations = mModel->getOperations();
     97     mUnknownInputCount.resize(operations.size());
     98     for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
     99         const Operation& operation = operations[operationIndex];
    100         uint32_t count = 0;
    101         for (uint32_t operandIndex : operation.inputs) {
    102             auto lifetime = mModel->getOperand(operandIndex).lifetime;
    103             if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
    104                 lifetime == OperandLifeTime::MODEL_OUTPUT) {
    105                 count++;
    106                 mOperandToOperations.insert(
    107                         std::pair<uint32_t, uint32_t>(operandIndex, operationIndex));
    108             }
    109         }
    110         if (count == 0) {
    111             cb(operationIndex);
    112         }
    113         mUnknownInputCount[operationIndex] = count;
    114     }
    115 }
    116 
    117 void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    118     // Mark all its outputs as known.
    119     const Operation& operation = mModel->getOperations()[operationIndex];
    120     for (uint32_t operandIndex : operation.outputs) {
    121         auto range = mOperandToOperations.equal_range(operandIndex);
    122         for (auto i = range.first; i != range.second; i++) {
    123             uint32_t& count = mUnknownInputCount[i->second];
    124             if (--count == 0) {
    125                 cb(i->second);
    126             }
    127         }
    128     }
    129 }
    130 
    131 ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
    132                              std::shared_ptr<Device> device)
    133         : mPlan(plan), mIndex(stepIndex), mSubModel(), mDevice(device) {}
    134 
// Adds an operand if it has not been added already.
// Sets the index in the submodel for the corresponding operand.
int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
                              const ModelBuilder& fromModel, OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(fromOperandIndex);
    if (i != mOperandMap.end()) {
        // An operand is defined at most once, so a revisit must be as an
        // operation input.
        nnAssert(kind == INPUT);
        *toOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *toOperandIndex = mSubModel.operandCount();
    mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex));

    // Add the operand to the submodel.
    const Operand& operand = fromModel.getOperand(fromOperandIndex);
    ANeuralNetworksOperandType type = {
        .type = static_cast<int32_t>(operand.type),
        .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
        .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
        .scale = operand.scale,
        .zeroPoint = operand.zeroPoint
    };
    int n = mSubModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            // Small constant stored inline in the original model; copy its
            // bytes into the submodel.
            const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
            n = mSubModel.setOperandValue(*toOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            // Large constant living in a memory pool; the submodel references
            // the same region rather than copying it.
            const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
            n = mSubModel.setOperandValueFromMemory(*toOperandIndex, memory,
                                                     operand.location.offset,
                                                     operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            // Omitted optional operand.
            n = mSubModel.setOperandValue(*toOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE:  // handled similarly to MODEL_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
            }
            break;
        case OperandLifeTime::MODEL_INPUT:
            mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            break;
        case OperandLifeTime::MODEL_OUTPUT:  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            }
            break;
        default:
            // Unknown lifetime value; indicates a corrupted model.
            nnAssert(false);
            break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}
    228 
    229 int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
    230     const Operation& operation = fromModel.getOperation(operationIndex);
    231 
    232     // Convert the input and output operand indexes.
    233     //
    234     // We expect operations to be added in topological order.  Therefore:
    235     //
    236     // - We may not have seen an input if it is a model input, a
    237     //   constant, or an operand written by a different partition.
    238     //
    239     // - We should not have seen any outputs.
    240     const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    241     const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    242     std::vector<uint32_t> inputs(inputCount);
    243     std::vector<uint32_t> outputs(outputCount);
    244 
    245     auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands,
    246                                           std::vector<uint32_t>& localOperands,
    247                                           OperandKind kind) -> int {
    248         const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
    249         for (uint32_t i = 0; i < operandCount; i++) {
    250             uint32_t localOperand = ~0U;
    251             int n = addOperand(globalOperands[i], &localOperand, fromModel, kind);
    252             if (n != ANEURALNETWORKS_NO_ERROR)
    253                 return n;
    254             localOperands[i] = localOperand;
    255         }
    256         return ANEURALNETWORKS_NO_ERROR;
    257     };
    258 
    259     int n;
    260     if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR ||
    261         (n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) {
    262         return n;
    263     }
    264 
    265     return mSubModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
    266                                    outputCount, outputs.data());
    267 }
    268 
    269 void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
    270     for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) {
    271         stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i);
    272     }
    273     for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) {
    274         stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i);
    275     }
    276 }
    277 
    278 void ExecutionPlan::CompoundBody::findTempsAsSubModelOutputs() {
    279     for (const auto& step : mSteps) {
    280         for (const auto& input : step->getTempsAsSubModelInputs()) {
    281             const uint32_t fromModelIndex = input.first;
    282             const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
    283             nnAssert(it != mTemporaryToDefiningStep.end());
    284             const uint32_t stepIndex = it->second;
    285             nnAssert(stepIndex < mSteps.size());
    286             mSteps[stepIndex]->recordTempAsSubModelOutput(fromModelIndex);
    287         }
    288     }
    289 }
    290 
    291 void ExecutionStep::logSubModel() const {
    292     VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;
    293 
    294     auto logRemapEntry = [](std::string &toLog, const std::pair<uint32_t, uint32_t>& e) {
    295         if (!toLog.empty()) {
    296             toLog += ", ";
    297         }
    298         toLog += "(";
    299         toLog += std::to_string(e.first);
    300         toLog += "->";
    301         toLog += std::to_string(e.second);
    302         toLog += ")";
    303     };
    304 
    305     auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
    306         std::string toLog;
    307         for (const auto& e : map) {
    308             logRemapEntry(toLog, e);
    309         }
    310         VLOG(COMPILATION) << name << ": " << toLog;
    311     };
    312     auto logRemapSet = [&logRemapEntry](const char* name, const SubModelOutputSetType& set) {
    313         std::string toLog;
    314         for (const auto& e : set) {
    315             logRemapEntry(toLog, e);
    316         }
    317         VLOG(COMPILATION) << name << ": " << toLog;
    318     };
    319 
    320     logRemapVector("model inputs", mModelInputs);
    321     logRemapVector("model outputs", mModelOutputs);
    322     logRemapVector("temps as submodel inputs", mTempsAsSubModelInputs);
    323     logRemapSet("temps as submodel outputs", mTempsAsSubModelOutputs);
    324     logRemapVector("outputs as submodel inputs", mOutputsAsSubModelInputs);
    325 }
    326 
    327 static void convertModelInputsOrOutputs(
    328         // IN: mModel{Inputs|Outputs}
    329         const ExecutionStep::RemapVectorType& myModelInputsOrOutputs,
    330         // IN: fromModel->{input|output}Count()
    331         uint32_t                              fromModelInputOrOutputCount,
    332         // IN: fromModel->get{Input|Output}OperandIndex
    333         std::function<uint32_t(uint32_t)>     fromModelGetInputOrOutputOperandIndex,
    334         // OUT: for v : mModel{Inputs|Outputs} : v.second
    335         std::vector<uint32_t>*                inputsOrOutputs,
    336         // OUT: submodel input-or-output index to original model input-or-output index
    337         std::vector<uint32_t>*                inputOrOutputIndexSubModelToFromModel) {
    338     std::map<uint32_t, uint32_t> fromModelIndexMap;  // operand index to input-or-output index
    339     for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) {
    340         fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i;
    341     }
    342     for (const auto& myInputOrOutput : myModelInputsOrOutputs) {
    343         inputsOrOutputs->push_back(myInputOrOutput.second);
    344         const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first];
    345         inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex);
    346     }
    347 }
    348 
    349 int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize,
    350                                   int32_t executionPreference) {
    351     if (VLOG_IS_ON(COMPILATION)) {
    352         logSubModel();
    353     }
    354 
    355     mSubModel.relaxComputationFloat32toFloat16(fromModel->isComputationFloat32RelaxedToFloat16());
    356 
    357     // Input order: mModelInputs, mTempsAsSubModelInputs, mOutputsAsSubModelInputs
    358     // Output order: mModelOutputs, mTempsAsSubModelOutputs
    359     //
    360     // ExecutionPlan::next() depends on these orderings.
    361 
    362     std::vector<uint32_t> inputs;
    363     convertModelInputsOrOutputs(mModelInputs,
    364                                 fromModel->inputCount(),
    365                                 [=](uint32_t i) { return fromModel->getInputOperandIndex(i); },
    366                                 &inputs,
    367                                 &mInputIndexSubModelToFromModel);
    368     for (const auto& subModelInput : mTempsAsSubModelInputs) {
    369         inputs.push_back(subModelInput.second);
    370     }
    371     for (const auto& subModelInput : mOutputsAsSubModelInputs) {
    372         inputs.push_back(subModelInput.second);
    373     }
    374 
    375     std::vector<uint32_t> outputs;
    376     convertModelInputsOrOutputs(mModelOutputs,
    377                                 fromModel->outputCount(),
    378                                 [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); },
    379                                 &outputs,
    380                                 &mOutputIndexSubModelToFromModel);
    381     for (const auto& subModelOutput : mTempsAsSubModelOutputs) {
    382         outputs.push_back(subModelOutput.second);
    383         const Operand& operand = mSubModel.getOperand(subModelOutput.second);
    384         for (uint32_t dimension : operand.dimensions) {
    385             if (dimension == 0) {
    386                 *hasOutputOfUnknownSize = true;
    387                 VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
    388                                 << " of original graph) has unknown size: "
    389                                 << toString(operand);
    390                 break;
    391             }
    392         }
    393     }
    394 
    395     {
    396         int n = mSubModel.identifyInputsAndOutputs(inputs.size(), &inputs[0], outputs.size(), &outputs[0]);
    397         if (n != ANEURALNETWORKS_NO_ERROR) {
    398             return n;
    399         }
    400         n = mSubModel.finish();
    401         if (n != ANEURALNETWORKS_NO_ERROR) {
    402             return n;
    403         }
    404     }
    405 
    406     {
    407         // Compute mOutputsAsSubModelInputsIndexToFromModel.
    408 
    409         std::map<uint32_t, uint32_t> fromModelOperandIndexToOutputIndex;
    410         for (unsigned i = 0, e = fromModel->outputCount(); i < e; ++i) {
    411             fromModelOperandIndexToOutputIndex[fromModel->getOutputOperandIndex(i)] = i;
    412         }
    413 
    414         for (unsigned i = 0, e = mOutputsAsSubModelInputs.size(); i < e; i++) {
    415             const uint32_t fromModelOperandIndex = mOutputsAsSubModelInputs[i].first;
    416             const auto it = fromModelOperandIndexToOutputIndex.find(fromModelOperandIndex);
    417             if (it == fromModelOperandIndexToOutputIndex.end()) {
    418                 LOG(ERROR) << "Could not find main model output operand " << fromModelOperandIndex
    419                            << " in main model output operand list";
    420                 return ANEURALNETWORKS_BAD_STATE;
    421             }
    422             mOutputsAsSubModelInputsIndexToFromModel.push_back(it->second);
    423         }
    424     }
    425 
    426     // TODO: Move compilation elsewhere?
    427 
    428     if (mDevice == nullptr) {
    429         return ANEURALNETWORKS_NO_ERROR;
    430     }
    431 
    432     VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation";
    433     return compile(mDevice, &mSubModel, executionPreference, &mPreparedSubModel);
    434 }
    435 
    436 void ExecutionStep::dump() const {
    437     Model model;
    438     mSubModel.setHidlModel(&model);
    439     if (VLOG_IS_ON(COMPILATION)) {
    440         VLOG(COMPILATION) << "ExecutionStep#" << mIndex
    441                           << " for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
    442         logModelToInfo(model);
    443     }
    444 }
    445 
    446 int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel,
    447                                         int32_t executionPreference) {
    448     findTempsAsSubModelOutputs();
    449     for (const auto& step : mSteps) {
    450         int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize,
    451                                      executionPreference);
    452         if (n != ANEURALNETWORKS_NO_ERROR) {
    453             VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed";
    454             return n;
    455         }
    456     }
    457     if (mHasSubModelOutputOfUnknownSize) {
    458         VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize";
    459         return ANEURALNETWORKS_OP_FAILED;
    460     }
    461 
    462     mSuccessfulFinish = true;
    463     return ANEURALNETWORKS_NO_ERROR;
    464 }
    465 
    466 int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel,
    467                                       int32_t executionPreference) {
    468     if (mDevice == nullptr) {
    469         mSuccessfulFinish = true;
    470         return ANEURALNETWORKS_NO_ERROR;
    471     }
    472 
    473     VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    474     const int n = compile(mDevice, mModel, executionPreference, &mPreparedModel);
    475     mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    476     return n;
    477 }
    478 
// Finishes the plan by delegating to its body (SimpleBody or CompoundBody).
// A body must already exist; calling this on an empty plan is a bug.
int ExecutionPlan::finish(const ModelBuilder* fromModel, int32_t executionPreference) {
    nnAssert(mBody != nullptr);
    return mBody->finish(fromModel, executionPreference);
}
    483 
    484 ExecutionPlan::Controller::Controller(
    485     const ExecutionPlan* plan,
    486     const ExecutionBuilder* executionBuilder,
    487     std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
    488     uint32_t totalSizeOfTemporaries) :
    489         mPlan(plan), mExecutionBuilder(executionBuilder),
    490         mSubModelInputsAndOutputs(subModelInputsAndOutputs), mNextStepIndex(0) {
    491     if (totalSizeOfTemporaries) {
    492         if (mTemporaries.create(totalSizeOfTemporaries) != ANEURALNETWORKS_NO_ERROR) {
    493             LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
    494             mNextStepIndex = kBadStepIndex;
    495         }
    496     }
    497 }
    498 
// Creates the per-execution Controller for this plan.  For a COMPOUND plan,
// lays out a single shared Memory region holding every temporary that crosses
// a partition boundary, recording each temporary's byte offset.  Returns
// nullptr if the plan did not finish successfully.
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
    const ExecutionBuilder* executionBuilder) const {
    nnAssert((mState == EMPTY) == (mBody == nullptr));
    if (mBody && !mBody->mSuccessfulFinish) {
        VLOG(EXECUTION) << "ExecutionPlan::makeController -- unsuccessful finish";
        return std::shared_ptr<Controller>(nullptr);
    }

    // Create the layout for a Memory object big enough for to hold
    // every TEMPORARY in the original model that is live across
    // partition boundaries.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
    if (mState == COMPOUND) {
        const ModelBuilder* fromModel = executionBuilder->getModel();
        for (const auto& step : compound()->mSteps) {
            for (const auto& output: step->getTempsAsSubModelOutputs()) {
                const uint32_t fromModelOperandIndex = output.first;
                const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
                // Lazily created: a compound plan may have no boundary temps.
                if (subModelInputsAndOutputs == nullptr) {
                    subModelInputsAndOutputs =
                            std::make_shared<Controller::SubModelInputsAndOutputsType>();
                }
                // Pad to the alignment the operand needs, then record its
                // offset within the shared temporary buffer.
                const uint32_t size = sizeOfData(fromModelOperand);
                totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
                subModelInputsAndOutputs->insert(std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
                totalSizeOfTemporaries += size;
            }
        }
        if (VLOG_IS_ON(EXECUTION) && (subModelInputsAndOutputs != nullptr)) {
            for (const auto& io : *subModelInputsAndOutputs) {
                VLOG(EXECUTION) << "temp: origOpndIdx = " << io.first
                                << ", offset = " << io.second;
            }
        }
    }

    return std::shared_ptr<Controller>(new Controller(this, executionBuilder,
                                                      subModelInputsAndOutputs,
                                                      totalSizeOfTemporaries));
}
    558 
    559 
    560 // TODO: Find a better way to provide this functionality.
    561 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
    562                             std::shared_ptr<StepExecutor>* executor) const {
    563     *executor = nullptr;
    564 
    565     VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
    566                     << "): mNextStepIndex = " << controller->mNextStepIndex;
    567 
    568     if (controller->mNextStepIndex == 0) {
    569         // We haven't called next().
    570         return ANEURALNETWORKS_OP_FAILED;
    571     }
    572 
    573     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
    574         // The last call to next() did not produce an executor.
    575         return ANEURALNETWORKS_OP_FAILED;
    576     }
    577 
    578     --controller->mNextStepIndex;
    579     return next(controller, executor);
    580 }
    581 
    582 int ExecutionPlan::next(std::shared_ptr<Controller> controller,
    583                         std::shared_ptr<StepExecutor>* executor) const {
    584     *executor = nullptr;
    585 
    586     VLOG(EXECUTION) << "ExecutionPlan::next("
    587                     << SHOW_IF_DEBUG(controller << ", " << executor)
    588                     << "): mNextStepIndex = " << controller->mNextStepIndex;
    589 
    590     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
    591         return ANEURALNETWORKS_OP_FAILED;
    592     }
    593 
    594     if (mState == EMPTY) {
    595         nnAssert(controller->mNextStepIndex == 0);  // end
    596         controller->mNextStepIndex = Controller::kBadStepIndex;
    597         return ANEURALNETWORKS_NO_ERROR;
    598     }
    599 
    600     if (mState == SIMPLE) {
    601         if (controller->mNextStepIndex == 0) {
    602             // First (and only) step.
    603             auto simpleBody = static_cast<const SimpleBody*>(mBody);
    604             *executor = std::make_shared<StepExecutor>(
    605                 controller->mExecutionBuilder,
    606                 simpleBody->mModel,
    607                 (simpleBody->mDevice == nullptr ? nullptr : simpleBody->mDevice->getInterface()),
    608                 simpleBody->mPreparedModel);
    609             (*executor)->mapInputsAndOutputsTrivially();
    610             controller->mNextStepIndex = 1;
    611             return ANEURALNETWORKS_NO_ERROR;
    612         }
    613 
    614         nnAssert(controller->mNextStepIndex == 1);  // end
    615         controller->mNextStepIndex = Controller::kBadStepIndex;
    616         return ANEURALNETWORKS_NO_ERROR;
    617     }
    618 
    619     auto compoundBody = compound();
    620 
    621     if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
    622         // end
    623         controller->mNextStepIndex = Controller::kBadStepIndex;
    624         return ANEURALNETWORKS_NO_ERROR;
    625     }
    626 
    627     // Input order: model inputs, temps as submodel inputs, outputs as submodel inputs
    628     // Output order: model outputs, temps as submodel outputs
    629     //
    630     // ExecutionStep::finishSubModel() establishes these orderings.
    631 
    632     const auto step = compoundBody->mSteps[controller->mNextStepIndex];
    633     *executor = std::make_shared<StepExecutor>(
    634         controller->mExecutionBuilder,
    635         step->getSubModel(),
    636         (step->getDevice() == nullptr ? nullptr : step->getDevice()->getInterface()),
    637         step->getPreparedSubModel());
    638     step->mapInputsAndOutputs(*executor);
    639     if (controller->mSubModelInputsAndOutputs != nullptr) {
    640         {
    641             // Tell executor about temps as submodel outputs.
    642 
    643             const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
    644             const auto& subModelOutputs = step->getTempsAsSubModelOutputs();
    645 
    646             uint32_t idx = 0;
    647             for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
    648                 const uint32_t fromModelOperandIndex = I->first;
    649                 const uint32_t offsetOfTemporary =
    650                     controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
    651                 int n = (*executor)->setOutputFromTemporaryMemory(
    652                     firstSubModelOutputIndex + idx,
    653                     &controller->mTemporaries,
    654                     offsetOfTemporary);
    655                 if (n != ANEURALNETWORKS_NO_ERROR) {
    656                     controller->mNextStepIndex = Controller::kBadStepIndex;
    657                     return n;
    658                 }
    659             }
    660         }
    661         {
    662             // Tell executor about temps as submodel inputs.
    663 
    664             const size_t firstSubModelInputIndex = step->getModelInputs().size();
    665             const auto& subModelInputs = step->getTempsAsSubModelInputs();
    666 
    667             uint32_t idx = 0;
    668             for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
    669                 const uint32_t fromModelOperandIndex = I->first;
    670                 const uint32_t offsetOfTemporary =
    671                     controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
    672                 int n = (*executor)->setInputFromTemporaryMemory(
    673                     firstSubModelInputIndex + idx,
    674                     &controller->mTemporaries,
    675                     offsetOfTemporary);
    676                 if (n != ANEURALNETWORKS_NO_ERROR) {
    677                     controller->mNextStepIndex = Controller::kBadStepIndex;
    678                     return n;
    679                 }
    680             }
    681         }
    682     }
    683     {
    684         // Tell executor about outputs as submodel inputs.
    685 
    686         const size_t firstOutputsAsSubModelInputIndex =
    687                 step->getModelInputs().size() + step->getTempsAsSubModelInputs().size();
    688         const auto& outputsAsSubModelInputsIndexToFromModel =
    689                 step->getOutputsAsSubModelInputsIndexToFromModel();
    690         for (uint32_t i = 0, e = outputsAsSubModelInputsIndexToFromModel.size(); i < e; i++) {
    691             uint32_t o = outputsAsSubModelInputsIndexToFromModel[i];
    692             (*executor)->mapOutputToInput(o, firstOutputsAsSubModelInputIndex + i);
    693         }
    694     }
    695 
    696     controller->mNextStepIndex++;
    697     return ANEURALNETWORKS_NO_ERROR;
    698 }
    699 
    700 std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
    701     nnAssert(mState != SIMPLE);
    702     if (mState == EMPTY) {
    703         mBody = new CompoundBody();
    704         mState = COMPOUND;
    705     }
    706     auto& steps = compound()->mSteps;
    707     auto step = std::make_shared<ExecutionStep>(this, steps.size(), device);
    708     steps.push_back(step);
    709     return step;
    710 }
    711 
// Turns an EMPTY plan into a SIMPLE one: the entire model will be run as
// a single step on the given device (nullptr represents the CPU, as in
// partitionTheWork()'s becomeSingleStep(nullptr /* CPU */, ...) call).
void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    nnAssert(mState == EMPTY);  // A plan's shape is decided exactly once.
    mBody = new SimpleBody(device, model);
    mState = SIMPLE;
}
    718 
    719 void ExecutionPlan::dump() const {
    720     if (mBody) {
    721         mBody->dump();
    722     } else {
    723         VLOG(COMPILATION) << "EMPTY";
    724     }
    725 }
    726 
    727 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    728     switch (mState) {
    729         case EMPTY:
    730             return Kind::EMPTY;
    731         case SIMPLE:
    732             nnAssert(mBody);
    733             return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
    734         case COMPOUND:
    735             nnAssert(mBody);
    736             return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
    737         default:
    738             nnAssert(!"unexpected state");
    739             return Kind::ERROR;
    740     }
    741 }
    742 
// Test-only accessor: the device selected for a SIMPLE plan (nullptr
// means the CPU).  Asserts the plan really is SIMPLE before downcasting.
std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    nnAssert(mState == SIMPLE);
    return static_cast<const SimpleBody*>(mBody)->mDevice;
}
    747 
// Test-only accessor: the ordered list of steps of a COMPOUND plan.
// Valid only when the plan is COMPOUND (compound() performs the downcast;
// presumably it checks mState -- verify against its definition).
const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}
    751 
// Test-only: whether any submodel output in this plan has a size not
// known at compile time.
// NOTE(review): dereferences mBody unconditionally -- callers must not
// invoke this on an EMPTY plan (mBody == nullptr).
bool ExecutionPlan::forTest_hasSubModelOutputsOfUnknownSize() const {
    return mBody->hasSubModelOutputsOfUnknownSize();
}
    755 
// Logs the single device (or "CPU" when mDevice is nullptr) that will
// run the whole model.
void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
}
    759 
// Logs every step of the partitioned plan, in execution order.
void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}
    765 
// Decides how to distribute this model's operations across the available
// devices and the CPU, recording the result in *plan.
//
// devices    - the available non-CPU (HAL) devices; the CPU is implicit.
// preference - an ANEURALNETWORKS_PREFER_* execution preference.
// plan       - out: becomes SIMPLE (one device runs everything) or
//              COMPOUND (a sequence of per-device steps).
// Returns ANEURALNETWORKS_NO_ERROR on success, or an error code.
int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                   uint32_t preference, ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.

    const size_t nonCpuDeviceCount = devices.size();
    // The device count is the number of HAL devices + 1. The +1 is for the CPU.
    // Note that deviceCount includes CPU, which has no entry in devices[].
    const size_t deviceCount = nonCpuDeviceCount + 1;
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
                      << ", operationCount = " << operationCount;

    // If we only have the CPU, or if the graph has no operations, no need to try to partition.
    if (nonCpuDeviceCount == 0 || operationCount == 0) {
        // Make sure no op is an OEM operation.
        // (The CPU cannot execute OEM operations; only a driver could.)
        for (auto& op: mOperations) {
            if (op.type == OperationType::OEM_OPERATION) {
                LOG(ERROR) << "No driver can do the OEM op";
                return ANEURALNETWORKS_BAD_DATA;
            }
        }
        plan->becomeSingleStep(nullptr /* CPU */, this);
        return plan->finish(this, preference);
    }

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector, with devices.size()
    // representing the CPU.
    std::vector<int> bestDeviceForOperation(operationCount);
    int status = findBestDeviceForEachOperation(preference, devices, deviceCount,
                                                &bestDeviceForOperation);
    if (status != ANEURALNETWORKS_NO_ERROR) {
        return status;
    }

    // If one device will run all the operations, we don't need to split the work.
    // (adjacent_find with not_equal_to finds the first pair of differing
    // neighbors; end() means every entry is identical.)
    if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        const bool cpu = (size_t(bestDeviceIndex) == deviceCount - 1);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                          << bestDeviceIndex << " = "
                          << (cpu ? "CPU" : devices[bestDeviceIndex]->getName());
        plan->becomeSingleStep(cpu ? nullptr : devices[bestDeviceIndex], this);
        return plan->finish(this, preference);
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex;
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the CPU. We do this to try to maximize the
    // size of the graph we'll send to non-CPU devices. If the CPU runs first,
    // it will have the chance to prepare more of the inputs required by the
    // other devices. This function returns -1 if all queues are empty.
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = deviceCount - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    // The tracker calls enqueueOnAppropriateDevice for each operation as
    // soon as its operands become available.
    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create an execution step.
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
        if (deviceIndex < 0) {
            break;
        }
        // nullptr represents the CPU.
        std::shared_ptr<Device> device =
                static_cast<size_t>(deviceIndex) < nonCpuDeviceCount
                        ? devices[deviceIndex] : nullptr;

        // Assign as much as possible to this device.
        std::shared_ptr<ExecutionStep> step = plan->createNewStep(device);
        auto& queue = perDeviceQueue[deviceIndex];
        while (!queue.empty()) {
            uint32_t operationIndex = queue.front();
            queue.pop();
            int n = step->addOperation(operationIndex, *this);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                return n;
            }
            // Marking an operation processed may unblock successors, which
            // get enqueued on their own best device's queue.
            tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
        }
    }

    int n = plan->finish(this, preference);
    if (VLOG_IS_ON(COMPILATION)) {
        Model model;
        setHidlModel(&model);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
        logModelToInfo(model);
        plan->dump();
    }
    return n;
}
    881 
    882 PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
    883                                                  uint32_t operationIndex) const {
    884     const Operation& operation = getOperation(operationIndex);
    885     // TODO This assumes that the type is dictated by the first operand. This is
    886     // currently the case but is not a safe assumption to make in the long term.
    887     const uint32_t operandIndex = operation.inputs[0];
    888     const OperandType operandType = mOperands[operandIndex].type;
    889     switch(operandType) {
    890         case OperandType::FLOAT32:
    891         case OperandType::TENSOR_FLOAT32:
    892             if (mRelaxComputationFloat32toFloat16) {
    893                 return device->getRelaxedFloat32toFloat16Performance();
    894             } else {
    895                 return device->getFloat32Performance();
    896             }
    897         case OperandType::INT32:
    898         case OperandType::UINT32:
    899         case OperandType::TENSOR_INT32:
    900         case OperandType::TENSOR_QUANT8_ASYMM:
    901             // For OEM, the real selection will be made from who can run the operand.
    902         case OperandType::OEM:
    903         case OperandType::TENSOR_OEM_BYTE:
    904             return device->getQuantized8Performance();
    905         default:
    906             nnAssert(false);
    907             return device->getQuantized8Performance();
    908     }
    909 }
    910 
    911 namespace {
    912 // This class determines whether a given device can execute a given operation
    913 class CanDo {
    914 public:
    915     CanDo() {}
    916 
    917     void initialize(const ModelBuilder* model, std::shared_ptr<Device> device) {
    918         Model hidlModel;
    919         model->setHidlModel(&hidlModel);
    920         device->getSupportedOperations(hidlModel, &mSupportsOperationByIndex);
    921     }
    922 
    923     bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
    924 
    925 private:
    926     hidl_vec<bool> mSupportsOperationByIndex;
    927 };
    928 };  // anonymous namespace
    929 
// For each operation of this model, picks the device expected to give the
// best performance under `preference`, writing device indices into
// *bestDeviceForOperation (devices[] index, or nonCpuDeviceCount for the
// CPU).  Fails with ANEURALNETWORKS_BAD_DATA if an OEM operation has no
// capable driver.
int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference,
        const std::vector<std::shared_ptr<Device>>& devices,
        const size_t deviceCount,
        std::vector<int>* bestDeviceForOperation) const {

    // Note that deviceCount includes CPU, which has no entry in devices[]
    const size_t nonCpuDeviceCount = deviceCount - 1;

    // Query each driver once, up front, for its per-operation support.
    std::vector<CanDo> canDo(nonCpuDeviceCount);
    for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(this, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        // Find which non-CPU device gives the best performance for this operation.
        int bestChoice = -1;
        float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
        for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
            const auto& device = devices[deviceIndex];
            if (canDo[deviceIndex].check(operationIndex)) {
                const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
                // Lower is better for both powerUsage and execTime.
                const float perfVal =
                            (preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
                                                                            : perf.execTime);
                if (bestChoice < 0 || perfVal < bestPerfVal) {
                    bestChoice = deviceIndex;
                    bestPerfVal = perfVal;
                }
            } else {
                // Somewhat noisy logging, but only place where the user of
                // NNAPI can get feedback on why an operation was not run on a
                // specific device.
                // Logs O(operationCount * nonCpuDeviceCount) times, but
                // typically nonCpuDeviceCount is very small.
                VLOG(COMPILATION) << "Device " << device->getName()
                                  << " can't do operation "
                                  << toString(getOperation(operationIndex).type);
            }
        }
        // If it's the OEM op, we'd better have a device able to do it.
        if (mOperations[operationIndex].type == OperationType::OEM_OPERATION) {
            if (bestChoice < 0) {
                LOG(ERROR) << "No driver can do the OEM op";
                return ANEURALNETWORKS_BAD_DATA;
            }
        } else {
            // If no driver has been found, or if the best driver is not better than the CPU,
            // prefer the CPU. Since the performance is a ratio compared to the CPU performance,
            // by definition the performance of the CPU is 1.0.
            if (bestChoice < 0 || bestPerfVal >= 1.0) {
                bestChoice = nonCpuDeviceCount;  // The ID of the CPU.
            }
        }

        (*bestDeviceForOperation)[operationIndex] = bestChoice;
        VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                          << toString(getOperation(operationIndex).type)
                          << ") = "
                          << (*bestDeviceForOperation)[operationIndex];
    }
    return ANEURALNETWORKS_NO_ERROR;
}
    995 
    996 } // namespace nn
    997 } // namespace android
    998