// Home | History | Annotate | Download | only in test
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "CompilationBuilder.h"
     18 #include "ExecutionPlan.h"
     19 #include "HalInterfaces.h"
     20 #include "Manager.h"
     21 #include "ModelBuilder.h"
     22 #include "NeuralNetworks.h"
     23 #include "NeuralNetworksOEM.h"
     24 #include "SampleDriver.h"
     25 #include "TestNeuralNetworksWrapper.h"
     26 #include "Utils.h"
     27 #include "ValidateHal.h"
     28 
     29 #include <gtest/gtest.h>
     30 
     31 #include <filesystem>
     32 #include <functional>
     33 #include <map>
     34 #include <queue>
     35 #include <type_traits>
     36 
     37 // Uncomment the following line to generate some debugging output that
     38 // may be useful when analyzing failures:
     39 //
     40 // #define VERBOSE VERBOSE
     41 
     42 // These tests do whitebox testing of the graph partitioning
     43 // algorithm.  It is "whitebox" in the sense that we're not evaluating
     44 // whether a particular partitioning is legal, or "good enough"
     45 // according to some metric, but whether it exactly matches the
     46 // expected behavior of the current partitioning algorithm.
     47 //
     48 // A key part of the current partitioning algorithm is to determine
     49 // which device among the available devices should be the one to
     50 // execute a particular operation from the graph.  This determination
     51 // is made "locally" -- i.e., it does not depend on the graph
     52 // topology, only on the properties of the operation in question.
     53 // IDevice::getSupportedOperations() indicates which operations in a
     54 // graph can be executed on a device, and IDevice::getCapabilities()
     55 // indicates how "good" that device is for executing particular kinds
     56 // of operations.  For each operation, the partitioning algorithm
     57 // picks the "best" device that is capable of executing that
     58 // operation; if no device can do so, then the algorithm picks the
     59 // cpu.
     60 //
     61 // As part of this testing approach, we want to make it easy to
     62 // specify which operations in a test graph can be executed on which
     63 // devices.  We accomplish this in the following way:
     64 // - A unary OEM operation is available.
     65 // - There is a collection of operations (each of which has two inputs
     66 //   and one output):
     67 //   - Eight kinds of operations available at driver version V1_0 or
     68 //     later.  They are represented in the graph as ADD or MUL with a
     69 //     particular activation function -- two opcodes times four
     70 //     activation functions means eight available operation kinds.
     71 //     This is a low-level representation detail -- when we specify the
     72 //     behavior of the device or build a graph, we do so in terms of
     73 //     operation encodings 0..7.
     74 //   - Eight kinds of operations available at driver version V1_1 or
     75 //     later.  They are represented in the graph as DIV or SUB with
     76 //     a particular activation function, exactly analogous to ADD
     77 //     and MUL above.  We use operation encodings 8..15 for them.
     78 //   - Four kinds of operations available at driver version V1_2 or
     79 //     later.  They are represented in the graph as MAXIMUM,
     80 //     MINIMUM, POW, or PRELU.  These operations take no activation
     81 //     function, so we only get 4 operation kinds, for which we
     82 //     use operation encodings 16..19.
     83 // When we instantiate a device for testing purposes, we specify what subset of
     84 // those operations the device is able to execute.
     85 //
     86 // In order to determine whether or not a partitioning matches the
     87 // expected partitioning, we check the number of partitions, check
     88 // which device each partition targets, and compare each partition's
     89 // subgraph, model inputs, model outputs, submodel inputs, and
     90 // submodel outputs against what is expected.  In order to perform
     91 // that comparison, we build a model to compare against a partition's
     92 // submodel and run a graph comparison algorithm on it.  The graph
     93 // comparison and the inputs and outputs comparisons are syntactic
     94 // rather than semantic comparisons -- they don't allow for
     95 // reorderings of inputs and outputs.  Because of this, we need to
     96 // know exactly how the partitioning algorithm orders inputs and
     97 // outputs in order to construct the models and operand lists to
     98 // compare against.  Here are some relevant behaviors of the
     99 // partitioning algorithm:
    100 //
    101 // - It builds a subgraph by walking operations in forward topological
    102 //   order, and adding each operation's input operands and output
    103 //   operands in index order (input followed by output) when that
    104 //   operation is added.  (It does not add an input that has already
    105 //   been added.)
    106 // - It finds model inputs, model outputs, and submodel inputs in
    107 //   the order the corresponding operands were added to the subgraph
    108 //   (see ExecutionStep methods getModelInputs(), getModelOutputs(),
    109 //   getTempsAsSubModelInputs(), getOutputsAsSubModelInputs()).
    110 // - It finds temps as submodel outputs in numerical order of corresponding
    111 //   operand number in the original model (see ExecutionStep method
    112 //   getTempsAsSubModelOutputs()).
    113 // - When it calls identifyInputsAndOutputs() on the submodel, it
    114 //   passes inputs from getModelInputs() in order, followed by temps as
    115 //   submodel inputs from getTempsAsSubModelInputs() in order,
    116 //   followed by outputs as submodel inputs from
    117 //   getOutputsAsSubModelInputs() in order; and it passes outputs from
    118 //   getModelOutputs() in order followed by submodel outputs from
    119 //   getTempsAsSubModelOutputs() in order.
    120 //
    121 // TODO: Maybe the logic for comparing a partition to an expected
    122 //       model should be changed to tolerate reorderings of inputs and
    123 //       outputs, so that when we build models and lists to compare
    124 //       against, we don't need to worry about input and output
    125 //       orderings.  But is there a way to do this that still lets us
    126 //       verify that we have the correct relationships between
    127 //       an (original) model's inputs and outputs and each submodel's
    128 //       inputs and outputs, as well as the correct relationship
    129 //       between submodel inputs and outputs across partitions?
    130 
    131 namespace {
    132 
// Timing reported for failed executions: both fields set to the "no
// measurement available" sentinel (UINT64_MAX).
const Timing kBadTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
    134 
    135 using CompilationBuilder = ::android::nn::CompilationBuilder;
    136 using Device = ::android::nn::Device;
    137 using DeviceManager = ::android::nn::DeviceManager;
    138 using ExecutePreference = ::android::nn::test_wrapper::ExecutePreference;
    139 using ExecutionPlan = ::android::nn::ExecutionPlan;
    140 using ExecutionStep = ::android::nn::ExecutionStep;
    141 using HalVersion = ::android::nn::HalVersion;
    142 using HidlModel = ::android::hardware::neuralnetworks::V1_2::Model;
    143 using HidlToken =
    144         ::android::hardware::hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>;
    145 using ModelBuilder = ::android::nn::ModelBuilder;
    146 using Result = ::android::nn::test_wrapper::Result;
    147 using SampleDriver = ::android::nn::sample_driver::SampleDriver;
    148 using WrapperSymmPerChannelQuantParams = ::android::nn::test_wrapper::SymmPerChannelQuantParams;
    149 using WrapperCompilation = ::android::nn::test_wrapper::Compilation;
    150 using WrapperModel = ::android::nn::test_wrapper::Model;
    151 using WrapperOperandType = ::android::nn::test_wrapper::OperandType;
    152 using WrapperType = ::android::nn::test_wrapper::Type;
    153 
    154 template <typename T> using sp = ::android::sp<T>;
    155 template <typename T>
    156 using MQDescriptorSync = ::android::hardware::MQDescriptorSync<T>;
    157 
    158 Capabilities makeCapabilities(float perf) {
    159     PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
    160     return {.relaxedFloat32toFloat16PerformanceScalar = perfInfo,
    161             .relaxedFloat32toFloat16PerformanceTensor = perfInfo,
    162             .operandPerformance = ::android::nn::nonExtensionOperandPerformance(perfInfo)};
    163 };
    164 
    165 void update(Capabilities* capabilities, OperandType type, float perf) {
    166     PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
    167     ::android::nn::update(&capabilities->operandPerformance, type, perfInfo);
    168 }
    169 
    170 float lookupExecTime(const Capabilities& capabilities, OperandType type) {
    171     return ::android::nn::lookup(capabilities.operandPerformance, type).execTime;
    172 }
    173 
// Operation-kind encodings (see the file comment above): each encoding in
// 0..19 names one testable operation kind.  For ADD/MUL/DIV/SUB the four
// fuse codes each get their own encoding; the V1_2 operations take no fuse
// code and get one encoding apiece.
const uint32_t kNumFuseCodes = 4;
const uint32_t kBadOperation = ~0;  // sentinel: "not one of the encoded operations"

// V1_0 operations
const uint32_t kFirstEncodingADD = 0;
const uint32_t kFirstEncodingMUL = kFirstEncodingADD + kNumFuseCodes;
const uint32_t kFirstEncodingV1_0 = kFirstEncodingADD;
const uint32_t kLastEncodingV1_0 = kFirstEncodingMUL + kNumFuseCodes - 1;

// V1_1 operations
const uint32_t kFirstEncodingDIV = kLastEncodingV1_0 + 1;
const uint32_t kFirstEncodingSUB = kFirstEncodingDIV + kNumFuseCodes;
const uint32_t kFirstEncodingV1_1 = kFirstEncodingDIV;
const uint32_t kLastEncodingV1_1 = kFirstEncodingSUB + kNumFuseCodes - 1;

// V1_2 operations
const uint32_t kFirstEncodingMAXIMUM = kLastEncodingV1_1 + 1;
const uint32_t kFirstEncodingMINIMUM = kFirstEncodingMAXIMUM + 1;
const uint32_t kFirstEncodingPOW = kFirstEncodingMINIMUM + 1;
const uint32_t kFirstEncodingPRELU = kFirstEncodingPOW + 1;
const uint32_t kFirstEncodingV1_2 = kFirstEncodingMAXIMUM;
const uint32_t kLastEncodingV1_2 = kFirstEncodingPRELU;
    196 
// Maps each HIDL OperationType used by these tests to the first encoding for
// that operation kind (for ADD/MUL/DIV/SUB, the encoding whose fuse code is 0).
const std::map<OperationType, uint32_t> operationToFirstEncoding = {
        {OperationType::ADD, kFirstEncodingADD},
        {OperationType::MUL, kFirstEncodingMUL},
        {OperationType::DIV, kFirstEncodingDIV},
        {OperationType::SUB, kFirstEncodingSUB},
        {OperationType::MAXIMUM, kFirstEncodingMAXIMUM},
        {OperationType::MINIMUM, kFirstEncodingMINIMUM},
        {OperationType::POW, kFirstEncodingPOW},
        {OperationType::PRELU, kFirstEncodingPRELU},
};
    207 
// Sorted in reverse order (std::greater) so that we can use map::lower_bound to
// find an entry whose key is numerically less than or equal to a search value.
// mapped_type is (OperandCode, hasFuseCode).
// hasFuseCode == true means the operation takes a fused activation function
// as a third input (and the encoding is first encoding + fuse code).
const std::map<uint32_t, std::pair<uint32_t, bool>, std::greater<>> firstEncodingToOperation = {
        {kFirstEncodingADD, {ANEURALNETWORKS_ADD, true}},
        {kFirstEncodingMUL, {ANEURALNETWORKS_MUL, true}},
        {kFirstEncodingDIV, {ANEURALNETWORKS_DIV, true}},
        {kFirstEncodingSUB, {ANEURALNETWORKS_SUB, true}},
        {kFirstEncodingMAXIMUM, {ANEURALNETWORKS_MAXIMUM, false}},
        {kFirstEncodingMINIMUM, {ANEURALNETWORKS_MINIMUM, false}},
        {kFirstEncodingPOW, {ANEURALNETWORKS_POW, false}},
        {kFirstEncodingPRELU, {ANEURALNETWORKS_PRELU, false}},
};
    221 
    222 // Look up the operation with the specified index in a graph, and return the
    223 // operation encoding; or, if for some reason this is not one of the encoded
    224 // operations, then return kBadOperation.
    225 uint32_t lookupOperation(std::function<const Operation&(uint32_t)> getOperation,
    226                          std::function<const Operand&(uint32_t)> getOperand,
    227                          std::function<const uint8_t*(uint32_t)> getValue,
    228                          uint32_t operationIndex) {
    229     const Operation& operation = getOperation(operationIndex);
    230     switch (operation.type) {
    231         case OperationType::ADD:
    232         case OperationType::MUL:
    233         case OperationType::DIV:
    234         case OperationType::SUB: {
    235             // input2 is the fused activation function
    236             const Operand& input2 = getOperand(operation.inputs[2]);
    237             if ((input2.type == OperandType::INT32) &&
    238                 (input2.lifetime == OperandLifeTime::CONSTANT_COPY)) {
    239                 int32_t value;
    240                 CHECK_EQ(sizeof(value), input2.location.length);
    241                 memcpy(&value,
    242                        getValue(input2.location.offset),
    243                        input2.location.length);
    244                 return value + operationToFirstEncoding.at(operation.type);
    245             }
    246             break;
    247         }
    248         default: {
    249             auto it = operationToFirstEncoding.find(operation.type);
    250             if (it != operationToFirstEncoding.end()) {
    251                 return it->second;
    252             }
    253             break;
    254         }
    255     }
    256     return kBadOperation;
    257 }
    258 
    259 uint32_t lookupOperation(const HidlModel& model, uint32_t operationIndex) {
    260     return lookupOperation(
    261         [&model](uint32_t index) -> const Operation& {
    262             return model.operations[index];
    263         },
    264         [&model](uint32_t index) -> const Operand& {
    265             return model.operands[index];
    266         },
    267         [&model](uint32_t offset) {return &model.operandValues[offset];},
    268         operationIndex);
    269 }
    270 
#ifdef VERBOSE
// This is a debugging utility function.
// Converts *model to its HIDL representation and prints the model, its
// input/output indexes, and each operation individually to std::cout.
void dump(const char* name, const ModelBuilder* model) {
    HidlModel hidlModel;
    model->setHidlModel(&hidlModel);
    std::cout << name << ": " << toString(hidlModel) << std::endl;
    std::cout << "inputs: " << toString(hidlModel.inputIndexes) << std::endl;
    std::cout << "outputs: " << toString(hidlModel.outputIndexes) << std::endl;
    for (size_t i = 0, e = hidlModel.operations.size(); i < e; i++) {
        std::cout << "operation[" << i << "]: " << toString(hidlModel.operations[i]) << std::endl;
    }
}
#endif
    284 
    285 // This is an IDevice for testing purposes.  It only has a few
    286 // interesting properties, all of which are specified as constructor
    287 // arguments: device capabilities; which subset of operation kinds
    288 // (0..19) does the device support; does the device support the OEM
    289 // operation.  The subset is represented with a bitmask, in which
    290 // operation kind K corresponds to the bit (1 << K).
class PartitioningDriver : public SampleDriver {
private:
    // Dummy class -- a prepared model must not be nullptr.
    // Every execution entry point reports DEVICE_UNAVAILABLE: these tests
    // exercise partitioning and compilation, never actual execution.
    class PartitioningPreparedModel : public IPreparedModel {
    public:
     Return<ErrorStatus> execute(const Request&, const sp<V1_0::IExecutionCallback>&) override {
         return ErrorStatus::DEVICE_UNAVAILABLE;
     }
     Return<ErrorStatus> execute_1_2(const Request&, MeasureTiming,
                                     const sp<V1_2::IExecutionCallback>&) override {
         return ErrorStatus::DEVICE_UNAVAILABLE;
     }
     Return<void> executeSynchronously(const Request&, MeasureTiming,
                                       executeSynchronously_cb cb) override {
         cb(ErrorStatus::DEVICE_UNAVAILABLE, {}, kBadTiming);
         return Void();
     }
     Return<void> configureExecutionBurst(
             const sp<V1_2::IBurstCallback>& /*callback*/,
             const MQDescriptorSync<V1_2::FmqRequestDatum>& /*requestChannel*/,
             const MQDescriptorSync<V1_2::FmqResultDatum>& /*resultChannel*/,
             configureExecutionBurst_cb cb) override {
         cb(ErrorStatus::DEVICE_UNAVAILABLE, nullptr);
         return Void();
     }
    };
public:
    // How this driver treats the OEM operation at each stage.
    enum OEM {
        OEMNo,          // rejected by getSupportedOperations and prepareModel
        OEMIndecisive,  // accepted by getSupportedOperations but not prepareModel
        OEMYes,         // accepted by getSupportedOperations and prepareModel
    };

    // operationMask: bit (1 << K) set means operation kind K (0..19, see the
    // encoding constants above) is supported by this device.
    PartitioningDriver(const char* name, const char* version, Capabilities capabilities,
                       uint32_t operationMask, OEM oem = OEMNo)
        : SampleDriver(name),
          mVersionString(version),
          mCapabilities(capabilities),
          mOperationMask(operationMask),
          mOEM(oem) {}
    ~PartitioningDriver() override {}

    Return<void> getVersionString(getVersionString_cb cb) override {
        cb(ErrorStatus::NONE, mVersionString);
        return Void();
    }

    // Fails with INVALID_ARGUMENT if the model contains an OEM operation and
    // this driver was not constructed with OEMYes.  Note that a dummy
    // prepared model is passed to the callback even on failure; callers in
    // these tests only examine the reported status.
    Return<ErrorStatus> prepareModel_1_2(const Model& model, ExecutionPreference,
                                         const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&,
                                         const HidlToken&,
                                         const sp<IPreparedModelCallback>& cb) override {
        ErrorStatus status = ErrorStatus::NONE;
        if (mOEM != OEMYes) {
            for (const auto& operation : model.operations) {
                if (operation.type == OperationType::OEM_OPERATION) {
                    status = ErrorStatus::INVALID_ARGUMENT;
                    break;
                }
            }
        }
        cb->notify_1_2(status, new PartitioningPreparedModel);
        return status;
    }

    Return<DeviceStatus> getStatus() override {
        return DeviceStatus::AVAILABLE;
    }

    Return<void> getCapabilities_1_2(getCapabilities_1_2_cb cb) override {
        cb(ErrorStatus::NONE, mCapabilities);
        return Void();
    }

    // An operation is supported iff its encoded kind's bit is set in
    // mOperationMask; the OEM operation is supported iff mOEM is not OEMNo.
    // Anything not expressible as an encoding is unsupported.
    Return<void> getSupportedOperations_1_2(const Model& model,
                                            getSupportedOperations_cb cb) override {
        if (!android::nn::validateModel(model)) {
            cb(ErrorStatus::INVALID_ARGUMENT, std::vector<bool>());
            return Void();
        }

        const size_t count = model.operations.size();
        std::vector<bool> supported(count);
        for (size_t i = 0; i < count; i++) {
            if (model.operations[i].type == OperationType::OEM_OPERATION) {
                supported[i] = (mOEM != OEMNo);
                continue;
            }
            supported[i] = false;
            uint32_t operation = lookupOperation(model, i);
            if ((operation != kBadOperation) && (mOperationMask & (1 << operation))) {
                supported[i] = true;
            }
        }
        cb(ErrorStatus::NONE, supported);
        return Void();
    }

    // Claims one model-cache and one data-cache file so that compilation
    // caching paths can be exercised.
    Return<void> getNumberOfCacheFilesNeeded(getNumberOfCacheFilesNeeded_cb cb) override {
        cb(ErrorStatus::NONE, /*numModelCache=*/1, /*numDataCache=*/1);
        return Void();
    }

    // Always succeeds, delivering a dummy prepared model.
    Return<ErrorStatus> prepareModelFromCache(
            const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&, const HidlToken&,
            const sp<V1_2::IPreparedModelCallback>& callback) override {
        callback->notify_1_2(ErrorStatus::NONE, new PartitioningPreparedModel);
        return ErrorStatus::NONE;
    }

   private:
    std::string mVersionString;
    Capabilities mCapabilities;
    uint32_t mOperationMask;  // bitmask of supported operation kinds (0..19)
    OEM mOEM;
};
    406 
    407 // Like PartitioningDriver, but implementing 1.1
class PartitioningDriverV1_1 : public V1_1::IDevice {
   public:
    PartitioningDriverV1_1(const char* name, const char* version, Capabilities capabilities,
                           uint32_t operationMask,
                           PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
        : mDriverV1_2(new PartitioningDriver(name, version, capabilities, operationMask, oem)) {}
    // All methods forward to the wrapped 1.2 driver; only the V1_1 interface
    // is exposed to the runtime, so the driver is treated as a 1.1 device.
    Return<void> getCapabilities_1_1(getCapabilities_1_1_cb _hidl_cb) override {
        return mDriverV1_2->getCapabilities_1_1(_hidl_cb);
    }
    Return<void> getSupportedOperations_1_1(const V1_1::Model& model,
                                            getSupportedOperations_1_1_cb _hidl_cb) override {
        return mDriverV1_2->getSupportedOperations_1_1(model, _hidl_cb);
    }
    Return<ErrorStatus> prepareModel_1_1(
            const V1_1::Model& model, ExecutionPreference preference,
            const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
        return mDriverV1_2->prepareModel_1_1(model, preference, actualCallback);
    }
    Return<DeviceStatus> getStatus() override { return mDriverV1_2->getStatus(); }
    Return<void> getCapabilities(getCapabilities_cb _hidl_cb) override {
        return mDriverV1_2->getCapabilities(_hidl_cb);
    }
    Return<void> getSupportedOperations(const V1_0::Model& model,
                                        getSupportedOperations_cb _hidl_cb) override {
        return mDriverV1_2->getSupportedOperations(model, _hidl_cb);
    }
    Return<ErrorStatus> prepareModel(
            const V1_0::Model& model,
            const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
        return mDriverV1_2->prepareModel(model, actualCallback);
    }

   private:
    // The real implementation; this class is just a version-narrowing shim.
    const sp<V1_2::IDevice> mDriverV1_2;
};
    443 
    444 // Like PartitioningDriver, but implementing 1.0
class PartitioningDriverV1_0 : public V1_0::IDevice {
   public:
    PartitioningDriverV1_0(const char* name, const char* version, Capabilities capabilities,
                           uint32_t operationMask,
                           PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
        : mDriverV1_2(new PartitioningDriver(name, version, capabilities, operationMask, oem)) {}
    // All methods forward to the wrapped 1.2 driver; only the V1_0 interface
    // is exposed to the runtime, so the driver is treated as a 1.0 device.
    Return<void> getCapabilities(getCapabilities_cb _hidl_cb) override {
        return mDriverV1_2->getCapabilities(_hidl_cb);
    }
    Return<void> getSupportedOperations(const V1_0::Model& model,
                                        getSupportedOperations_cb _hidl_cb) override {
        return mDriverV1_2->getSupportedOperations(model, _hidl_cb);
    }
    Return<ErrorStatus> prepareModel(
            const V1_0::Model& model,
            const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
        return mDriverV1_2->prepareModel(model, actualCallback);
    }
    Return<DeviceStatus> getStatus() override { return mDriverV1_2->getStatus(); }

   private:
    // The real implementation; this class is just a version-narrowing shim.
    const sp<V1_2::IDevice> mDriverV1_2;
};
    468 
    469 // This class adds some simple abstractions and utilities on top of
    470 // WrapperModel.  For example, it provides methods that work in terms of
    471 // operation kind (0..7); and because we care about graph topology rather than
    472 // details of operand types and values, it greatly simplifies the process of
    473 // creating operands.
    474 class PartitioningModel : private WrapperModel {
    475    public:
    476     using WrapperModel::finish;
    477     using WrapperModel::getHandle;
    478     using WrapperModel::identifyInputsAndOutputs;
    479     using WrapperModel::isValid;
    480     using WrapperModel::relaxComputationFloat32toFloat16;
    481 
    482     // Create a tensor operand of the specified type, and return the
    483     // corresponding operand index.
    uint32_t addFloatOperand() { return addOperand(WrapperType::TENSOR_FLOAT32); }  // float32 tensor
    uint32_t addQuantOperand() { return addOperand(WrapperType::TENSOR_QUANT8_ASYMM); }  // quant8 tensor
    486 
    487     // Create an operand of the specified type, and return the corresponding
    488     // operand index.
    489     uint32_t addOperand(WrapperType wrapperType) {
    490         switch (static_cast<int>(wrapperType)) {
    491             case ANEURALNETWORKS_BOOL:
    492             case ANEURALNETWORKS_FLOAT16:
    493             case ANEURALNETWORKS_FLOAT32:
    494             case ANEURALNETWORKS_INT32:
    495             case ANEURALNETWORKS_UINT32:
    496             case ANEURALNETWORKS_OEM_SCALAR: {
    497                 WrapperOperandType wrapperOperandType(wrapperType, {});
    498                 mWrapperOperandType.push_back(wrapperOperandType);
    499                 return WrapperModel::addOperand(&wrapperOperandType);
    500             }
    501 
    502             case ANEURALNETWORKS_TENSOR_BOOL8:
    503             case ANEURALNETWORKS_TENSOR_FLOAT16:
    504             case ANEURALNETWORKS_TENSOR_FLOAT32:
    505             case ANEURALNETWORKS_TENSOR_OEM_BYTE: {
    506                 WrapperOperandType wrapperOperandType(wrapperType, {1});
    507                 mWrapperOperandType.push_back(wrapperOperandType);
    508                 return WrapperModel::addOperand(&wrapperOperandType);
    509             }
    510 
    511             case ANEURALNETWORKS_TENSOR_INT32:
    512             case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM:
    513             case ANEURALNETWORKS_TENSOR_QUANT8_SYMM:
    514             case ANEURALNETWORKS_TENSOR_QUANT16_ASYMM:
    515             case ANEURALNETWORKS_TENSOR_QUANT16_SYMM: {
    516                 WrapperOperandType wrapperOperandType(wrapperType, {1}, 1.0f);
    517                 mWrapperOperandType.push_back(wrapperOperandType);
    518                 return WrapperModel::addOperand(&wrapperOperandType);
    519             }
    520 
    521             case ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL: {
    522                 WrapperOperandType wrapperOperandType(wrapperType, {1}, 0.0f, 0,
    523                                                       WrapperSymmPerChannelQuantParams({1.0f}, 0));
    524                 mWrapperOperandType.push_back(wrapperOperandType);
    525                 return WrapperModel::addOperand(&wrapperOperandType);
    526             }
    527 
    528             default:
    529                 ADD_FAILURE() << "Unexpected type " << static_cast<uint32_t>(wrapperType);
    530                 return ~uint32_t(0);
    531         }
    532     }
    533 
    // Controls whether a created operand gets dimension extent 1 (YES) or
    // 0 (NO); see addOperandOfSameType().
    enum class Dimensioned { NO, YES };
    535 
    536     // Create a V1_0 operation with two inputs and one output, specifying the
    537     // operation kind (where 0 is the first V1_0 operation) and the input
    538     // operand indexes.
    539     // Returns the output operand index.
    540     uint32_t addOperation2To1V1_0(uint32_t operation, const uint32_t input0, const uint32_t input1,
    541                                   Dimensioned dimensionedOutput = Dimensioned::YES) {
    542         CHECK_LE(operation, kLastEncodingV1_0 - kFirstEncodingV1_0);
    543         return addOperation2To1(operation + kFirstEncodingV1_0, input0, input1, dimensionedOutput);
    544     }
    545 
    546     // Create a V1_1 operation with two inputs and one output, specifying the
    547     // operation kind (where 0 is the first V1_1 operation) and the input
    548     // operand indexes.
    549     // Returns the output operand index.
    550     uint32_t addOperation2To1V1_1(uint32_t operation, const uint32_t input0, const uint32_t input1,
    551                                   Dimensioned dimensionedOutput = Dimensioned::YES) {
    552         CHECK_LE(operation, kLastEncodingV1_1 - kFirstEncodingV1_1);
    553         return addOperation2To1(operation + kFirstEncodingV1_1, input0, input1, dimensionedOutput);
    554     }
    555 
    556     // Create a V1_2 operation with two inputs and one output, specifying the
    557     // operation kind (where 0 is the first V1_2 operation) and the input
    558     // operand indexes.
    559     // Returns the output operand index.
    560     uint32_t addOperation2To1V1_2(uint32_t operation, const uint32_t input0, const uint32_t input1,
    561                                   Dimensioned dimensionedOutput = Dimensioned::YES) {
    562         CHECK_LE(operation, kLastEncodingV1_2 - kFirstEncodingV1_2);
    563         return addOperation2To1(operation + kFirstEncodingV1_2, input0, input1, dimensionedOutput);
    564     }
    565 
    566     // Create an OEM operation with one input and one output,
    567     // specifying the input operand index.  Returns the output operand
    568     // index.
    569     uint32_t addOperationOEM1To1(const uint32_t input,
    570                                  Dimensioned dimensionedOutput = Dimensioned::YES) {
    571         uint32_t output = addOperandOfSameType(input, dimensionedOutput);
    572         addOperation(ANEURALNETWORKS_OEM_OPERATION, { input }, { output });
    573         return output;
    574     }
    575 
    576     // Run the partitioning algorithm to create an ExecutionPlan.
    577     int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
    578                          ExecutePreference preference, ExecutionPlan* plan) {
    579         return reinterpret_cast<ModelBuilder*>(getHandle())->partitionTheWork(
    580             devices, static_cast<uint32_t>(preference), plan);
    581     }
    582 
#ifdef VERBOSE
    // This is a debugging utility function.
    // Prints this model to std::cout via the free function ::dump() above.
    void dump(const char* name) const {
        const ModelBuilder* mb = reinterpret_cast<const ModelBuilder*>(getHandle());
        ::dump(name, mb);
    }
#endif
    590 
    591 private:
    592  // Create an operation with two inputs and one output, specifying
    593  // the operation kind and the input operand indexes.
    594  // Returns the output operand index.
    595  uint32_t addOperation2To1(uint32_t operation, const uint32_t input0, const uint32_t input1,
    596                            Dimensioned dimensionedOutput = Dimensioned::YES) {
    597      auto it = firstEncodingToOperation.lower_bound(operation);
    598      CHECK(it != firstEncodingToOperation.end());
    599      ANeuralNetworksOperationType type = it->second.first;
    600      if (it->second.second) {
    601          int32_t fuseCode = operation - it->first;
    602          uint32_t input2 = addIntOperand(fuseCode);
    603          uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
    604          addOperation(type, {input0, input1, input2}, {output});
    605          return output;
    606      } else {
    607          uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
    608          addOperation(type, {input0, input1}, {output});
    609          return output;
    610      }
    611  }
    612 
    613  // Create a scalar integer operand of the specified value, and
    614  // return the corresponding operand index.
    615  uint32_t addIntOperand(int32_t value) {
    616      uint32_t operand = addOperand(WrapperType::INT32);
    617      setOperandValue(operand, &value, sizeof(value));
    618      return operand;
    619     }
    620 
    621     // Create an operand of the same type as the specified operand,
    622     // and return the operand index of the new operand.
    623     uint32_t addOperandOfSameType(uint32_t operand, Dimensioned dimensioned = Dimensioned::YES) {
    624         WrapperOperandType type = mWrapperOperandType.at(operand);
    625         for (auto& dimension : type.dimensions) {
    626             dimension = (dimensioned == Dimensioned::YES);
    627         }
    628         mWrapperOperandType.push_back(type);
    629         return WrapperModel::addOperand(&type);
    630     }
    631 
    632     // operand index to operand type
    633     std::vector<WrapperOperandType> mWrapperOperandType;
    634 };
    635 
    636 // This class adds some utilities on top of WrapperCompilation.
    637 class PartitioningCompilation : public WrapperCompilation {
    638 public:
    639  PartitioningCompilation(const PartitioningModel* model,
    640                          const std::vector<std::shared_ptr<Device>>& devices) {
    641      ModelBuilder* m = reinterpret_cast<ModelBuilder*>(model->getHandle());
    642      CompilationBuilder* c = nullptr;
    643      int result = m->createCompilation(&c, devices);
    644      EXPECT_EQ(result, 0);
    645      mCompilation = reinterpret_cast<ANeuralNetworksCompilation*>(c);
    646  }
    647 
    648  Result setPartitioning(uint32_t partitioning) {
    649      return static_cast<Result>(builder()->setPartitioning(partitioning));
    650     }
    651 
    652     using WrapperCompilation::finish;
    653 
    654     const ExecutionPlan& getExecutionPlan() const {
    655         return builder()->forTest_getExecutionPlan();
    656     }
    657 
    658 private:
    659     CompilationBuilder* builder() {
    660         return reinterpret_cast<CompilationBuilder*>(getHandle());
    661     }
    662 
    663     const CompilationBuilder* builder() const {
    664         return reinterpret_cast<const CompilationBuilder*>(getHandle());
    665     }
    666 };
    667 
// RETURN_TRUE()/RETURN_FALSE(MESSAGE) are used by the graph comparison
// routines below.  In a VERBOSE build they log the __LINE__ of the return
// site to std::cerr -- for RETURN_FALSE, MESSAGE may be empty or a streamed
// suffix such as << "reason" -- before returning; otherwise they just return.
#ifdef VERBOSE
#define RETURN_TRUE()                                                          \
    {                                                                          \
        std::cerr << "returning true from " << __LINE__ << std::endl;          \
        return true;                                                           \
    }
#else
#define RETURN_TRUE()                                                          \
    {                                                                          \
        return true;                                                           \
    }
#endif
#ifdef VERBOSE
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        std::cerr << "returning false from " << __LINE__ MESSAGE << std::endl; \
        return false;                                                          \
    }
#else
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        return false;                                                          \
    }
#endif
    692 
class PartitioningTest : public ::testing::Test {
protected:
    using RemapVectorType = ExecutionStep::RemapVectorType;
    using SubModelOutputSetType = ExecutionStep::SubModelOutputSetType;

    // No per-test setup is needed; present to satisfy the ::testing::Test contract.
    virtual void SetUp() {
    }

    // From a vector of DeviceSpecification, create a vector of
    // Devices.
    struct DeviceSpecification {
        // Fully-specified capabilities; version string defaults to kVersionString.
        DeviceSpecification(const std::string& name, const Capabilities& capabilities,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : mName(name),
              mVersionString(kVersionString),
              mCapabilities(capabilities),
              mOperationMask(operationMask),
              mOEM(oem) {}
        // Single performance number used for both relaxed and non-relaxed execution.
        DeviceSpecification(const std::string& name, float perf, uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, perf, perf, operationMask, oem) {}
        // Distinct relaxed-computation performance; version string defaults.
        DeviceSpecification(const std::string& name, float perf, float perfRelaxed,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, kVersionString, perf, perfRelaxed, operationMask, oem) {}
        // Explicit version string; same relaxed and non-relaxed performance.
        DeviceSpecification(const std::string& name, const std::string& version, float perf,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, version, perf, perf, operationMask, oem) {}
        // The constructor the others delegate to: builds mCapabilities from
        // the two performance numbers (perfRelaxed for the relaxed-fp16
        // fields, perf for everything else).
        DeviceSpecification(const std::string& name, const std::string& version, float perf,
                            float perfRelaxed, uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : mName(name), mVersionString(version), mOperationMask(operationMask), mOEM(oem) {
            PerformanceInfo perfRelaxedInfo = {.execTime = perfRelaxed, .powerUsage = perfRelaxed};
            mCapabilities = {.relaxedFloat32toFloat16PerformanceScalar = perfRelaxedInfo,
                             .relaxedFloat32toFloat16PerformanceTensor = perfRelaxedInfo,
                             .operandPerformance = ::android::nn::nonExtensionOperandPerformance(
                                     {.execTime = perf, .powerUsage = perf})};
        }
        // Explicit HAL version together with per-HAL-version operation masks,
        // which are combined via makeOperationMask() below.
        DeviceSpecification(const std::string& name, float perf, HalVersion halVersion,
                            uint32_t operationMaskV1_0, uint32_t operationMaskV1_1 = 0,
                            uint32_t operationMaskV1_2 = 0)
            : DeviceSpecification(name, perf, perf,
                                  makeOperationMask(halVersion, operationMaskV1_0,
                                                    operationMaskV1_1, operationMaskV1_2)) {
            mHalVersion = halVersion;
        }

        std::string mName;
        std::string mVersionString;
        Capabilities mCapabilities;
        HalVersion mHalVersion = HalVersion::LATEST;
        uint32_t mOperationMask;
        PartitioningDriver::OEM mOEM = PartitioningDriver::OEMNo;

        static constexpr char kVersionString[] = "JUST_AN_EXAMPLE";

       private:
        // This function takes three operation masks aligned at the low-order
        // bit -- one mask each for V1_0, V1_1, and V1_2 -- and produces a single
        // composite operation mask, formed by shifting each of the input
        // operation masks appropriately and ORing the results together.
        //
        // For convenience, any bits of an input mask that are too high order
        // for that mask are discarded -- this allows ~0 to be a legal input
        // mask.
        //
        // For the sake of example, assume that each low order mask is 4 bits
        // wide, and take some artistic license to write literals in binary.
        // Then:
        //
        //     assert(makeOperationMask(HalVersion::V1_2, 0b0110, 0b1001, 0b0101) ==
        //            0b 0101 1001 0110);
        //
        // This is used by a DeviceSpecification constructor to build a mask of
        // operations to be supported by the device.
        static uint32_t makeOperationMask(HalVersion halVersion, uint32_t operationMaskV1_0,
                                          uint32_t operationMaskV1_1, uint32_t operationMaskV1_2) {
            // A device of a given HAL version may not claim operations
            // introduced by a later HAL version.
            if (halVersion < HalVersion::V1_2) {
                CHECK(!operationMaskV1_2);
            }
            if (halVersion < HalVersion::V1_1) {
                CHECK(!operationMaskV1_1);
            }
            auto maskOfWidth = [](uint32_t width) -> uint32_t { return (1U << width) - 1; };
            static const uint32_t kOperationMaskV1_0 =
                    maskOfWidth(kLastEncodingV1_0 - kFirstEncodingV1_0 + 1);
            static const uint32_t kOperationMaskV1_1 =
                    maskOfWidth(kLastEncodingV1_1 - kFirstEncodingV1_1 + 1);
            static const uint32_t kOperationMaskV1_2 =
                    maskOfWidth(kLastEncodingV1_2 - kFirstEncodingV1_2 + 1);
            return ((operationMaskV1_0 & kOperationMaskV1_0) << kFirstEncodingV1_0) |
                   ((operationMaskV1_1 & kOperationMaskV1_1) << kFirstEncodingV1_1) |
                   ((operationMaskV1_2 & kOperationMaskV1_2) << kFirstEncodingV1_2);
        }
    };
    // Instantiate a PartitioningDriver of the HAL version named by each
    // specification, wrap each in a Device, and append the CPU device at the
    // end (mirroring DeviceManager's device list).
    static std::vector<std::shared_ptr<Device>> makeDevices(
            std::vector<DeviceSpecification> specifications) {
        std::vector<std::shared_ptr<Device>> devices;
        for (const auto& specification : specifications) {
            V1_0::IDevice* halDriver = nullptr;
            switch (specification.mHalVersion) {
                case HalVersion::V1_2:
                    halDriver = new PartitioningDriver(
                            specification.mName.c_str(), specification.mVersionString.c_str(),
                            specification.mCapabilities, specification.mOperationMask,
                            specification.mOEM);
                    break;
                case HalVersion::V1_1:
                    halDriver = new PartitioningDriverV1_1(
                            specification.mName.c_str(), specification.mVersionString.c_str(),
                            specification.mCapabilities, specification.mOperationMask,
                            specification.mOEM);
                    break;
                case HalVersion::V1_0:
                    halDriver = new PartitioningDriverV1_0(
                            specification.mName.c_str(), specification.mVersionString.c_str(),
                            specification.mCapabilities, specification.mOperationMask,
                            specification.mOEM);
                    break;
                default:
                    ADD_FAILURE() << "Unexpected";
            }
            auto device = DeviceManager::forTest_makeDriverDevice(specification.mName, halDriver);
            devices.push_back(device);
        }
        devices.push_back(DeviceManager::getCpuDevice());
        return devices;
    }

    /*-- Graph comparision ----------------------------------------------------------------*/

    // An operand with certain values for its lifetime does not have a
    // defining operation in the graph.  For the purposes of the graph
    // comparison algorithm, we encode the "defining operation" index of
    // such an operand as follows:
    // - NO_VALUE       kPseudoDefiningOperationNoValue
    // - MODEL_INPUT    kPseudoDefiningOperationModelInput0 + (position in list of inputs)
    // - CONSTANT_COPY  kPseudoDefiningOperationConstantCopy0 + (constant value)
    //                    Note: For the graphs we build in this test, we
    //                          only expect to see 4-byte constants within
    //                          a very restricted range, so we only make
    //                          room for such constants in our encoding
    //                          space.
    // We do not expect to see CONSTANT_REFERENCE, and so we do not handle
    // it.
    //
    // The encoding is intended to be relatively human readable; it is not
    // designed to represent some optimal balance of ranges for the items
    // within its scope (actual operations, inputs, constants).

    enum PseudoDefiningOperationEncodings : uint32_t {
        kPseudoDefiningOperationModelInput0   = 0x80000000U,
        kPseudoDefiningOperationConstantCopy0 = 0x90000000U,
        kPseudoDefiningOperationNoValue       = 0xeeeeeeeeU,

        // lowest value for special encoding
        kPseudoDefiningOperationBase          = 0x80000000U,

        // range of encoded input or constant
        kPseudoDefiningOperationRange         = 0x10000000U,
    };

    // Build a map from operand to defining operation.
    // Uses gtest ASSERT_* internally, so on failure it records a fatal
    // failure; callers must check HasFatalFailure() afterwards.
    // TODO: Replace map with vector?
    void buildDefinitionMap(const ModelBuilder* model,
                            std::map<uint32_t, uint32_t>* defMap) {
        // actual definitions
        ASSERT_LT(model->operationCount(), kPseudoDefiningOperationBase);
        for (uint32_t i = 0, e = model->operationCount(); i < e; i++) {
            const Operation& operation = model->getOperation(i);
            for (uint32_t output : operation.outputs) {
                (*defMap)[output] = i;
            }
        }
        // inputs
        ASSERT_LT(model->inputCount(), kPseudoDefiningOperationRange);
        for (uint32_t i = 0, e = model->inputCount(); i < e; i++) {
            (*defMap)[model->getInputOperandIndex(i)] = kPseudoDefiningOperationModelInput0 + i;
        }
        // look for NO_VALUE and CONSTANT_COPY
        for (uint32_t i = 0, e = model->operandCount(); i < e; i++) {
            const Operand& operand = model->getOperand(i);
            switch (operand.lifetime) {
                case OperandLifeTime::NO_VALUE:
                    (*defMap)[i] = kPseudoDefiningOperationNoValue;
                    break;
                case OperandLifeTime::CONSTANT_COPY: {
                    ASSERT_EQ(operand.location.length, sizeof(uint32_t));
                    uint32_t value;
                    memcpy(&value, model->getPointerToOperandValue(operand.location.offset), sizeof(uint32_t));
                    // The constant must fit in the encoding space reserved above.
                    ASSERT_LT(value, kPseudoDefiningOperationNoValue);
                    (*defMap)[i] = kPseudoDefiningOperationConstantCopy0 + value;
                    break;
                }
                case OperandLifeTime::TEMPORARY_VARIABLE:
                case OperandLifeTime::MODEL_INPUT:
                case OperandLifeTime::MODEL_OUTPUT:
                    // already handled
                    break;
                default:
                    FAIL();
                    break;
            }
        }
        // sanity check
        ASSERT_EQ(model->operandCount(), defMap->size());
    }

#ifdef VERBOSE
    // Debugging utility: print a definition/equivalence map, rendering
    // pseudo-definition encodings in hex for readability.
    void dump(const char* name, const std::map<uint32_t, uint32_t>* aMap) {
        auto writeNum = [](uint32_t num) {
            if (num >= kPseudoDefiningOperationBase) {
                std::cout << "0x" << std::hex << num << std::dec;
            } else {
                std::cout << num;
            }
        };

        std::cout << name << ": { ";
        bool gotOne = false;
        for (const auto& entry : *aMap) {
            if (gotOne) {
                std::cout << ", ";
            } else {
                gotOne = true;
            }
            std::cout << "(";
            writeNum(entry.first);
            std::cout << ", ";
            writeNum(entry.second);
            std::cout << ")";
        }
        std::cout << " }" << std::endl;
    }
#endif

    // Compare two operands for equivalence in the graph-comparison sense:
    // type, dimensions, consumer count, and quantization parameters must
    // match.  Other fields (e.g., lifetime, location) are deliberately not
    // compared here -- they are accounted for by the pseudo-definition
    // encoding used in the graph walk below.
    bool compare(const Operand& operandA, const Operand& operandB) {
        if (operandA.type != operandB.type ||
            operandA.dimensions != operandB.dimensions ||
            operandA.numberOfConsumers != operandB.numberOfConsumers ||
            operandA.scale != operandB.scale ||
            operandA.zeroPoint != operandB.zeroPoint) {
            return false;
        }
        return true;
    }

    // Compare two graphs.  We ignore operand and operation indexes (i.e.,
    // two nodes can be the same even if they are numbered differently)
    // but we also ignore semantics (e.g., even if an operation kind is
    // such that the operand is commutative, we still pay attention to the
    // order of its input operands).
    //
    // The comparison algorithm works by walking modelA from outputs
    // towards inputs, along the edge from each operand to its
    // defining operation, and then along the edges to the operation's
    // input operands.  At each step along the way, we try to match up
    // operands and operations from modelA with equivalent operands
    // and operations from modelB.
    //
    // We start by assuming that modelA's outputs and modelB's outputs
    // match positionally (e.g., modelA's first output operand is
    // equivalent to modelB's first output operand).  Once we've
    // discovered two equivalent operands (such as those outputs), we
    // place them in a work queue.  We repeatedly pull operands off
    // the queue and compare their defining operations and those
    // operations' input operands, to discover more pairs of
    // equivalent operands.  If we ever find operations that do not
    // match (e.g., because operation kind differs), or operands that
    // do not match (e.g., because operand type differs); or if we
    // ever find a conflict (we've already decided that operand A's
    // equivalent operand is B0, but it looks like we need its
    // equivalent operand to be B1); then the graphs compare unequal.
    // Otherwise, we'll eventually exhaust the work queue, and
    // conclude that the graphs compare equal.
    //
    // As a side effect of the comparison, we produce a map
    // *inputsAndOutputsBToA that maps from each of the model input and output
    // operand numbers of modelB to the corresponding operand numbers of modelA.
    // If the comparison returns false, the contents of the map are undefined.
    bool compare(const ModelBuilder* modelA, const ModelBuilder* modelB,
                 std::map<uint32_t, uint32_t>* inputsAndOutputsBToA) {
        CHECK(inputsAndOutputsBToA != nullptr);
        EXPECT_TRUE(inputsAndOutputsBToA->empty());

#ifdef VERBOSE
        ::dump("compare(A)", modelA);
        ::dump("compare(B)", modelB);
#endif

        // Quick rejection: equivalent graphs must have identical counts.
        if (modelA->operandCount()   != modelB->operandCount()   ||
            modelA->operationCount() != modelB->operationCount() ||
            modelA->inputCount()     != modelB->inputCount()     ||
            modelA->outputCount()    != modelB->outputCount()) {
            RETURN_FALSE();
        }

        // Maps from operand index to index of defining operation.
        std::map<uint32_t, uint32_t> defsA, defsB;
        buildDefinitionMap(modelA, &defsA);
        buildDefinitionMap(modelB, &defsB);
        if (HasFatalFailure()) return false;

        // Maps from operand index in modelA to equivalent operand index
        // in modelB; and from operation index in modelA to equivalent
        // operation index in modelB.
        std::map<uint32_t, uint32_t> equivalentOperandsAToB;
        std::map<uint32_t, uint32_t> equivalentOperationsAToB;

        // Queue of operand indexes from modelA, each of whose defining
        // operations are to be checked for equivalence with modelB.
        std::queue<uint32_t> workQueueOperandsA;

        // Seed operand equivalence map and work queue from model outputs.
        for (uint32_t i = 0, e = modelA->outputCount(); i < e; i++) {
            uint32_t outputA = modelA->getOutputOperandIndex(i);
            uint32_t outputB = modelB->getOutputOperandIndex(i);
            if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
                RETURN_FALSE();
            }
            equivalentOperandsAToB[outputA] = outputB;
            workQueueOperandsA.push(outputA);
        }

#ifdef VERBOSE
        dump("defsA", &defsA);
        dump("defsB", &defsB);
#endif

        // Process the queue.
        uint32_t pseudoDefinitionCount = 0;
        while (!workQueueOperandsA.empty()) {
#ifdef VERBOSE
            dump("equivalentOperandsAToB", &equivalentOperandsAToB);
            dump("equivalentOperationsAToB", &equivalentOperationsAToB);
#endif
            uint32_t operandIndexA = workQueueOperandsA.front();
#ifdef VERBOSE
            std::cout << "operandIndexA: " << operandIndexA << std::endl;
#endif
            workQueueOperandsA.pop();
            uint32_t operandIndexB = equivalentOperandsAToB.at(operandIndexA);

            uint32_t operationIndexA = defsA.at(operandIndexA);
            uint32_t operationIndexB = defsB.at(operandIndexB);
            auto it = equivalentOperationsAToB.find(operationIndexA);
            if (it != equivalentOperationsAToB.end()) {
                // Already matched this operation; just verify consistency.
                if (it->second != operationIndexB) {
                    RETURN_FALSE();
                }
                continue;
            }

            // We haven't identified an equivalent operation for
            // operationIndexA.

            if ((operationIndexA >= kPseudoDefiningOperationBase) !=
                (operationIndexB >= kPseudoDefiningOperationBase)) {
                RETURN_FALSE();
            }
            // Either both operands have pseudo-definitions, or neither
            // does.
            if (operationIndexA >= kPseudoDefiningOperationBase) {
                // Both operands have pseudo-definitions.
                if (operationIndexA != operationIndexB) {
                    RETURN_FALSE();
                }
                equivalentOperationsAToB[operationIndexA] = operationIndexB;
                ++pseudoDefinitionCount;
                continue;
            }

            // If we get here, neither operation A nor operation B is a
            // pseudo-definition.

            const Operation& operationA = modelA->getOperation(operationIndexA);
            const Operation& operationB = modelB->getOperation(operationIndexB);
            if (operationA.type != operationB.type ||
                operationA.inputs.size() != operationB.inputs.size() ||
                operationA.outputs.size() != operationB.outputs.size()) {
                RETURN_FALSE();
            }
            equivalentOperationsAToB[operationIndexA] = operationIndexB;
            for (uint32_t i = 0, e = operationA.inputs.size(); i < e; i++) {
                uint32_t inputA = operationA.inputs[i];
                uint32_t inputB = operationB.inputs[i];
                auto it = equivalentOperandsAToB.find(inputA);
                if (it != equivalentOperandsAToB.end()) {
                    if (it->second != inputB) {
                        RETURN_FALSE();
                    }
                    continue;
                }
                // We haven't identified an equivalent operand for inputA.
                if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
                    RETURN_FALSE();
                }
                equivalentOperandsAToB[inputA] = inputB;
                workQueueOperandsA.push(inputA);
            }
        }

        // Sanity check
        if (modelA->operandCount() != defsA.size() ||
            modelA->operandCount() != defsB.size() ||
            modelA->operandCount() != equivalentOperandsAToB.size() ||
            modelA->operationCount() + pseudoDefinitionCount != equivalentOperationsAToB.size()) {
            RETURN_FALSE();
        }

        // Build *inputsAndOutputsBToA
        for (uint32_t aInputIndex : modelA->getInputOperandIndexes()) {
            (*inputsAndOutputsBToA)[equivalentOperandsAToB.at(aInputIndex)] = aInputIndex;
        }
        for (uint32_t aOutputIndex : modelA->getOutputOperandIndexes()) {
            (*inputsAndOutputsBToA)[equivalentOperandsAToB.at(aOutputIndex)] = aOutputIndex;
        }

        RETURN_TRUE();
    }

    /*-------------------------------------------------------------------------------------*/

    // Compare an ExecutionStep against an expected model and device.
    // As a side effect of the comparison, we produce a map
    // *inputsAndOutputsModelToStep that maps from each of the model input and
    // output operand numbers of "model" to the corresponding operand numbers of
    // the submodel from "step".  If the comparison returns false, the contents
    // of the map are undefined.
    bool compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
                 std::shared_ptr<Device> device,
                 std::map<uint32_t, uint32_t>* inputsAndOutputsModelToStep) {
        return (step->getDevice() == device) &&
               compare(step->getSubModel(),
                       reinterpret_cast<const ModelBuilder*>(model->getHandle()),
                       inputsAndOutputsModelToStep);
    }

    // Assert that an ExecutionStep matches the expected device and submodel,
    // and that its remap vectors and submodel-output set agree with the
    // expected values (expressed in "model" operand numbering).
    void compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
                 std::shared_ptr<Device> device, const RemapVectorType& modelInputs,
                 const RemapVectorType& modelOutputs, const RemapVectorType& tempsAsSubModelInputs,
                 const SubModelOutputSetType& tempsAsSubModelOutputs,
                 const RemapVectorType& outputsAsSubModelInputs) {
        std::map<uint32_t, uint32_t> inputsAndOutputsModelToStep;
        ASSERT_NO_FATAL_FAILURE(
                ASSERT_TRUE(compare(step, model, device, &inputsAndOutputsModelToStep)));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep, step->getModelInputs(),
                                        modelInputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep, step->getModelOutputs(),
                                        modelOutputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
                                        step->getTempsAsSubModelInputs(), tempsAsSubModelInputs));
        ASSERT_TRUE(compareSubModelOutputSets(inputsAndOutputsModelToStep,
                                              step->getTempsAsSubModelOutputs(),
                                              tempsAsSubModelOutputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
                                        step->getOutputsAsSubModelInputs(),
                                        outputsAsSubModelInputs));
    }

   private:
    // Translate the second element of each pair in "model" (a model operand
    // number) to the corresponding step operand number, then compare with the
    // step's vector.  "model" is taken by value because it is rewritten in
    // place.
    static bool compareRemapVectors(const std::map<uint32_t, uint32_t>& inputsAndOutputsModelToStep,
                                    const RemapVectorType& step, RemapVectorType model) {
        std::transform(model.begin(), model.end(), model.begin(),
                       [&inputsAndOutputsModelToStep](const RemapVectorType::value_type& val) {
                           return std::make_pair(val.first,
                                                 inputsAndOutputsModelToStep.at(val.second));
                       });
        return step == model;
    }

    // Same translation as compareRemapVectors(), but for set-valued
    // submodel-output collections (a new set is built since sets cannot be
    // transformed in place).
    static bool compareSubModelOutputSets(
            const std::map<uint32_t, uint32_t>& inputsAndOutputsModelToStep,
            const SubModelOutputSetType& step, const SubModelOutputSetType& model) {
        SubModelOutputSetType modelTransformed;
        std::transform(
                model.begin(), model.end(), std::inserter(modelTransformed, modelTransformed.end()),
                [&inputsAndOutputsModelToStep](const SubModelOutputSetType::value_type& val) {
                    return std::make_pair(val.first, inputsAndOutputsModelToStep.at(val.second));
                });
        return step == modelTransformed;
    }
};
   1177 
// Checks partitioning of a trivial two-operation chain
// (opnd4 = op1(op0(opnd0, opnd1), opnd3)) under three device configurations:
// a single best device (SIMPLE plan), no device better than the CPU (SIMPLE
// plan on CPU), and two devices each supporting only one of the two operation
// kinds (COMPOUND plan with two steps).
TEST_F(PartitioningTest, SimpleModel) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    // Operation kind 0 produces the temp opnd2 ...
    uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
    uint32_t opnd3 = model.addFloatOperand();
    // ... which operation kind 1 consumes to produce the model output opnd4.
    uint32_t opnd4 = model.addOperation2To1V1_0(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Simple partition (two devices are each capable of everything, one is the best).
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    // (Per the expectations below, a lower perf value wins: "good" at 0.5
    // beats "bad" at 0.9.)
    const auto devicesA = makeDevices({{"bad", 0.9, ~0U}, {"good", 0.5, ~0U}});
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(planA.forTest_simpleGetDevice()->getName(), "good");

    // Simple partition (two devices are each capable of everything, none better than CPU).
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesC = makeDevices({{"bad", 1.1, ~0U}, {"bad2", 1.0, ~0U}});
    ExecutionPlan planC;
    ASSERT_EQ(model.partitionTheWork(devicesC, ExecutePreference::PREFER_LOW_POWER, &planC),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planC.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(planC.forTest_simpleGetDevice(), DeviceManager::getCpuDevice());

    // Compound partition (two devices, each is capable of one of the
    // two operations).  We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    // (The third makeDevices argument is an operation-kind bitmask:
    // device "0" supports only kind 0, device "1" only kind 1.)
    const auto devicesB = makeDevices({{"0", 0.9, 1 << 0}, {"1", 0.5, 1 << 1}});
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(2));
    {
        // Build a model to compare against the submodel from stepsB[0].
        // Step 0 runs operation kind 0 on device "0"; its result opnd2 leaves
        // the step as a temp (tempsAsSubModelOutputs below).
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1V1_0(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({ b0Opnd0, b0Opnd1 }, { b0Opnd2 });
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[0], &modelB0, devicesB[0],
                        RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
                        RemapVectorType{},                                    // modelOutputs
                        RemapVectorType{},                        // tempsAsSubModelInputs
                        SubModelOutputSetType{{opnd2, b0Opnd2}},  // tempsAsSubModelOutputs
                        RemapVectorType{}));                      // outputsAsSubModelInputs;
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        // Step 1 runs operation kind 1 on device "1", consuming step 0's temp.
        PartitioningModel modelB1;
        uint32_t b1Opnd2 = modelB1.addFloatOperand();
        uint32_t b1Opnd3 = modelB1.addFloatOperand();
        uint32_t b1Opnd4 = modelB1.addOperation2To1V1_0(1, b1Opnd2, b1Opnd3);
        // Note: In the partitioning algorithm, submodel inputs follow
        // model inputs.  In the original model "model", opnd2 is not
        // an input; so in the submodel "modelB1", the corresponding
        // input b1Opnd2 is a submodel input, and must follow the
        // model input b1Opnd3.
        modelB1.identifyInputsAndOutputs({ b1Opnd3, b1Opnd2 }, { b1Opnd4 });
        modelB1.finish();
        ASSERT_TRUE(modelB1.isValid());

        ASSERT_NO_FATAL_FAILURE(compare(stepsB[1], &modelB1, devicesB[1],
                                        RemapVectorType{{opnd3, b1Opnd3}},  // modelInputs
                                        RemapVectorType{{opnd4, b1Opnd4}},  // modelOutputs
                                        RemapVectorType{{opnd2, b1Opnd2}},  // tempsAsSubModelInputs
                                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
                                        RemapVectorType{}));      // outputsAsSubModelInputs
    }
}
   1262 
// Exercises version slicing: the model mixes operations introduced in HAL
// versions V1_0, V1_1, and V1_2, and the partitioner must route each
// operation to a device whose HAL version can handle it.
TEST_F(PartitioningTest, SliceModel) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    // Two V1_0 operations, one V1_1 operation, and one V1_2 operation that
    // consumes the two V1_0 results.
    uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1V1_0(1, opnd0, opnd1);
    uint32_t opnd4 = model.addOperation2To1V1_1(0, opnd0, opnd1);
    uint32_t opnd5 = model.addOperation2To1V1_2(0, opnd2, opnd3);
    model.identifyInputsAndOutputs({opnd0, opnd1}, {opnd2, opnd4, opnd5});
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Simple partition (V1_0, V1_1, V1_2 devices are available; V1_2 has best perf).
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesA = makeDevices({{"V1_0", 0.8, HalVersion::V1_0, ~0U},
                                       {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
                                       {"V1_2", 0.6, HalVersion::V1_2, ~0U, ~0U, ~0U}});
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(planA.forTest_simpleGetDevice()->getName(), "V1_2");

    // Compound partition (V1_0, V1_1, V1_2 devices are available, in decreasing
    // order of performance; model is distributed across all three devices).
    const auto devicesB = makeDevices({{"V1_0", 0.6, HalVersion::V1_0, ~0U},
                                       {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
                                       {"V1_2", 0.8, HalVersion::V1_2, ~0U, ~0U, ~0U}});
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(3));
    {
        // Build a model to compare against the submodel from stepsB[0].
        // Step 0 holds the lone V1_1 operation, assigned to devicesB[1]
        // ("V1_1") below.
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1V1_1(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({b0Opnd0, b0Opnd1}, {b0Opnd2});
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[0], &modelB0, devicesB[1],
                        RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
                        RemapVectorType{{opnd4, b0Opnd2}},                    // modelOutputs
                        RemapVectorType{},        // tempsAsSubModelInputs
                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
                        RemapVectorType{}));      // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        // Step 1 holds both V1_0 operations, assigned to devicesB[0] ("V1_0").
        PartitioningModel modelB1;
        uint32_t b1Opnd0 = modelB1.addFloatOperand();
        uint32_t b1Opnd1 = modelB1.addFloatOperand();
        uint32_t b1Opnd2 = modelB1.addOperation2To1V1_0(0, b1Opnd0, b1Opnd1);
        uint32_t b1Opnd3 = modelB1.addOperation2To1V1_0(1, b1Opnd0, b1Opnd1);
        modelB1.identifyInputsAndOutputs({b1Opnd0, b1Opnd1}, {b1Opnd2, b1Opnd3});
        modelB1.finish();
        ASSERT_TRUE(modelB1.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[1], &modelB1, devicesB[0],
                        RemapVectorType{{opnd0, b1Opnd0}, {opnd1, b1Opnd1}},  // modelInputs
                        RemapVectorType{{opnd2, b1Opnd2}},                    // modelOutputs
                        RemapVectorType{},                        // tempsAsSubModelInputs
                        SubModelOutputSetType{{opnd3, b1Opnd3}},  // tempsAsSubModelOutputs
                        RemapVectorType{}));                      // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from stepsB[2].
        // Step 2 holds the V1_2 operation, assigned to devicesB[2] ("V1_2");
        // it consumes step 1's temp (opnd3) and model output (opnd2).
        PartitioningModel modelB2;
        uint32_t b2Opnd0 = modelB2.addFloatOperand();
        uint32_t b2Opnd1 = modelB2.addFloatOperand();
        uint32_t b2Opnd2 = modelB2.addOperation2To1V1_2(0, b2Opnd0, b2Opnd1);
        // Note: In the partitioning algorithm, temps that are
        // submodel inputs precede model outputs that are submodel
        // inputs.  In the original model "model", opnd3 is a temp and
        // opnd2 is a model output; so in the submodel "modelB2", the
        // corresponding inputs b2Opnd1 and b2Opnd0 must appear in
        // that order.
        modelB2.identifyInputsAndOutputs({b2Opnd1, b2Opnd0}, {b2Opnd2});
        modelB2.finish();
        ASSERT_TRUE(modelB2.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[2], &modelB2, devicesB[2], RemapVectorType{},  // modelInputs
                        RemapVectorType{{opnd5, b2Opnd2}},                    // modelOutputs
                        RemapVectorType{{opnd3, b2Opnd1}},    // tempsAsSubModelInputs
                        SubModelOutputSetType{},              // tempsAsSubModelOutputs
                        RemapVectorType{{opnd2, b2Opnd0}}));  // outputsAsSubModelInputs
    }

    // TODO: Make sure this still works when we have multiple devices
    // of same version available for slicing. An easy (?) choice would
    // be to route the two different V1_0 operations to different
    // devices.
}
   1365 
   1366 TEST_F(PartitioningTest, SliceModelToEmpty) {
   1367     PartitioningModel model;
   1368     uint32_t opnd0 = model.addFloatOperand();
   1369     uint32_t opnd1 = model.addFloatOperand();
   1370     uint32_t opnd2 = model.addOperation2To1V1_2(0, opnd0, opnd1);
   1371     model.identifyInputsAndOutputs({opnd0, opnd1}, {opnd2});
   1372     model.finish();
   1373     ASSERT_TRUE(model.isValid());
   1374 
   1375     // Only the V1_2 device can handle any operations in the model.
   1376     // No need to compare the original model to the model from the plan -- we
   1377     // didn't actually do any partitioning.
   1378     const auto devices = makeDevices({{"V1_0", 0.6, HalVersion::V1_0, ~0U},
   1379                                       {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
   1380                                       {"V1_2", 0.8, HalVersion::V1_2, ~0U, ~0U, ~0U}});
   1381     ExecutionPlan plan;
   1382     ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
   1383               ANEURALNETWORKS_NO_ERROR);
   1384     ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
   1385     ASSERT_NE(plan.forTest_simpleGetDevice().get(), nullptr);
   1386     ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "V1_2");
   1387 }
   1388 
TEST_F(PartitioningTest, Cpu) {
    // Here's a model where some operations execute only on the Cpu.
    // To make things interesting, we produce three partitions --
    // device, cpu, same-device.

    // Operation kinds: the sole driver below supports only kDevOp, so every
    // kCpuOp operation must fall to the CPU.
    static const uint32_t kCpuOp = 1;
    static const uint32_t kDevOp = 2;

    const auto devices = makeDevices({{"1", 0.5, 1 << kDevOp}});

    PartitioningModel model;

    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();

    // First partition (device): two kDevOp operations.
    uint32_t opnd2 = model.addOperation2To1V1_0(kDevOp, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1V1_0(kDevOp, opnd0, opnd2);

    // Second partition (CPU): two kCpuOp operations consuming the device
    // partition's results.
    uint32_t opnd4 = model.addOperation2To1V1_0(kCpuOp, opnd0, opnd3);
    uint32_t opnd5 = model.addOperation2To1V1_0(kCpuOp, opnd2, opnd4);

    uint32_t opnd6 = model.addFloatOperand();

    // Third partition (back on the same device): two more kDevOp operations.
    uint32_t opnd7 = model.addOperation2To1V1_0(kDevOp, opnd3, opnd5);
    uint32_t opnd8 = model.addOperation2To1V1_0(kDevOp, opnd6, opnd7);

    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd6 }, { opnd4, opnd8 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(3));
    {
        const auto& step0 = steps[0];

        // Build a model to compare against the submodel from steps[0].
        // Both of its results (opnd2, opnd3) leave the step as temps.
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1V1_0(kDevOp, m0Opnd0, m0Opnd1);
        uint32_t m0Opnd3 = model0.addOperation2To1V1_0(kDevOp, m0Opnd0, m0Opnd2);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2, m0Opnd3 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(step0, &model0, devices[0],
                        RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
                        RemapVectorType{},                                    // modelOutputs
                        RemapVectorType{},  // tempsAsSubModelInputs
                        SubModelOutputSetType{{opnd2, m0Opnd2},
                                              {opnd3, m0Opnd3}},  // tempsAsSubModelOutputs
                        RemapVectorType{}));                      // outputsAsSubModelInputs
    }
    {
        const auto& step1 = steps[1];

        // Build a model to compare against the submodel from steps[1].
        // This is the CPU step: it consumes step 0's temps and produces the
        // model output opnd4 plus the temp opnd5.
        PartitioningModel model1;
        uint32_t m1Opnd0 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addFloatOperand();
        uint32_t m1Opnd4 = model1.addOperation2To1V1_0(kCpuOp, m1Opnd0, m1Opnd3);
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd5 = model1.addOperation2To1V1_0(kCpuOp, m1Opnd2, m1Opnd4);
        model1.identifyInputsAndOutputs({ m1Opnd0, m1Opnd3, m1Opnd2 }, { m1Opnd4, m1Opnd5 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());

        ASSERT_NO_FATAL_FAILURE(compare(
                step1, &model1, DeviceManager::getCpuDevice(),
                RemapVectorType{{opnd0, m1Opnd0}},                    // modelInputs
                RemapVectorType{{opnd4, m1Opnd4}},                    // modelOutputs
                RemapVectorType{{opnd3, m1Opnd3}, {opnd2, m1Opnd2}},  // tempsAsSubModelInputs
                SubModelOutputSetType{{opnd5, m1Opnd5}},              // tempsAsSubModelOutputs
                RemapVectorType{}));                                  // outputsAsSubModelInputs
    }
    {
        const auto& step2 = steps[2];

        // Build a model to compare against the submodel from steps[2].
        // Back on the device: consumes temps opnd3 and opnd5, produces the
        // model output opnd8.
        PartitioningModel model2;
        uint32_t m2Opnd3 = model2.addFloatOperand();
        uint32_t m2Opnd5 = model2.addFloatOperand();
        uint32_t m2Opnd7 = model2.addOperation2To1V1_0(kDevOp, m2Opnd3, m2Opnd5);
        uint32_t m2Opnd6 = model2.addFloatOperand();
        uint32_t m2Opnd8 = model2.addOperation2To1V1_0(kDevOp, m2Opnd6, m2Opnd7);
        model2.identifyInputsAndOutputs({ m2Opnd6, m2Opnd3, m2Opnd5 }, { m2Opnd8 });
        model2.finish();
        ASSERT_TRUE(model2.isValid());

        ASSERT_NO_FATAL_FAILURE(compare(
                step2, &model2, devices[0], RemapVectorType{{opnd6, m2Opnd6}},  // modelInputs
                RemapVectorType{{opnd8, m2Opnd8}},                              // modelOutputs
                RemapVectorType{{opnd3, m2Opnd3}, {opnd5, m2Opnd5}},  // tempsAsSubModelInputs
                SubModelOutputSetType{},                              // tempsAsSubModelOutputs
                RemapVectorType{}));                                  // outputsAsSubModelInputs
    }
}
   1491 
   1492 TEST_F(PartitioningTest, SetPartitioning) {
   1493     PartitioningModel model;
   1494     uint32_t opnd0 = model.addFloatOperand();
   1495     uint32_t opnd1 = model.addFloatOperand();
   1496     uint32_t opnd2 =
   1497             model.addOperation2To1V1_0(0, opnd0, opnd1, PartitioningModel::Dimensioned::NO);
   1498     uint32_t opnd3 = model.addFloatOperand();
   1499     uint32_t opnd4 = model.addOperation2To1V1_0(1, opnd2, opnd3);
   1500     model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
   1501     model.finish();
   1502     ASSERT_TRUE(model.isValid());
   1503 
   1504     // We expect that we cannot successfully partition, because we
   1505     // have an intermediate operand (opnd2) without dimensions, and
   1506     // this is not currently handled.
   1507 
   1508     // One device that can and should execute operation 0.
   1509     const auto devices = makeDevices({{"hw", 0.5, (1 << 0)}});
   1510 
   1511     // Test kPartitioningNo.  We should not even attempt partitioning,
   1512     // so there should be a SIMPLE plan on CPU.
   1513     // No need to compare the original model to the model from the plan -- we
   1514     // didn't actually do any partitioning.
   1515     PartitioningCompilation cPNo(&model, devices);
   1516     ASSERT_EQ(cPNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
   1517     ASSERT_EQ(cPNo.finish(), Result::NO_ERROR);
   1518     ASSERT_EQ(cPNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
   1519     ASSERT_EQ(cPNo.getExecutionPlan().forTest_simpleGetDevice(), DeviceManager::getCpuDevice());
   1520 
   1521     // Test kPartitioningWithFallback.  We should attempt
   1522     // partitioning, reach the end of the partitioning process (so we
   1523     // have an unsuccessful execution plan), discover the dimensionless
   1524     // intermediate operand, then fallback to CPU with a SIMPLE plan, and
   1525     // finally return success.
   1526     // No need to compare the original model to the model from the plan -- we
   1527     // didn't actually do any partitioning.
   1528     PartitioningCompilation cPWithFallback(&model, devices);
   1529     ASSERT_EQ(cPWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback), Result::NO_ERROR);
   1530     ASSERT_EQ(cPWithFallback.finish(), Result::NO_ERROR);
   1531     ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
   1532     ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_simpleGetDevice(),
   1533               DeviceManager::getCpuDevice());
   1534 
   1535     // Test kPartitioningWithoutFallback.  We should attempt
   1536     // partitioning, and fail.
   1537     PartitioningCompilation cPWithoutFallback(&model, devices);
   1538     ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback), Result::NO_ERROR);
   1539     ASSERT_EQ(cPWithoutFallback.finish(), Result::OP_FAILED);
   1540     ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize());
   1541     ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);
   1542 }
   1543 
// Regression test for http://b/69166603:
//     "partitioned compilation and execution yields wrong results when model output is submodel input"
TEST_F(PartitioningTest, ModelOutputAsSubmodelInput) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    // opnd2 is BOTH a model output and the input to the second operation --
    // the configuration that triggered the original bug.
    uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1V1_0(1, opnd2, opnd2);
    model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2, opnd3 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Compound partition (two devices, each is capable of one of the
    // two operations).  We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    const auto devices = makeDevices({{"0", 0.5, 1 << 0}, {"1", 0.5, 1 << 1}});
    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(2));
    {
        // Build a model to compare against the submodel from steps[0].
        // Step 0 produces the model output opnd2 directly.
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1V1_0(0, m0Opnd0, m0Opnd1);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());
        ASSERT_NO_FATAL_FAILURE(
                compare(steps[0], &model0, devices[0],
                        RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
                        RemapVectorType{{opnd2, m0Opnd2}},                    // modelOutputs
                        RemapVectorType{},        // tempsAsSubModelInputs
                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
                        RemapVectorType{}));      // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from steps[1].
        // Step 1 consumes the model output opnd2 as a submodel input
        // (outputsAsSubModelInputs below) -- the case from the bug report.
        PartitioningModel model1;
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addOperation2To1V1_0(1, m1Opnd2, m1Opnd2);
        model1.identifyInputsAndOutputs({ m1Opnd2 }, { m1Opnd3 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(steps[1], &model1, devices[1], RemapVectorType{},  // modelInputs
                        RemapVectorType{{opnd3, m1Opnd3}},                 // modelOutputs
                        RemapVectorType{},                                 // tempsAsSubModelInputs
                        SubModelOutputSetType{},                           // tempsAsSubModelOutputs
                        RemapVectorType{{opnd2, m1Opnd2}}));  // outputsAsSubModelInputs
    }
}
   1601 
   1602 TEST_F(PartitioningTest, OemOperations) {
   1603     // Trivial model consisting solely of OEM operation.
   1604     PartitioningModel model;
   1605     uint32_t opndIn = model.addFloatOperand();
   1606     uint32_t opndOut = model.addOperationOEM1To1(opndIn);
   1607     model.identifyInputsAndOutputs({ opndIn }, { opndOut });
   1608     model.finish();
   1609     ASSERT_TRUE(model.isValid());
   1610 
   1611     // Verify that the best driver than can run an OEM operation is
   1612     // used, even if it is not better than the CPU.
   1613     // No need to compare the original model to the model from the plan -- we
   1614     // didn't actually do any partitioning.
   1615     const auto devicesBestOEM = makeDevices({{"badOEM", 1.5, ~0U, PartitioningDriver::OEMYes},
   1616                                              {"noOEM", 0.5, ~0U, PartitioningDriver::OEMNo},
   1617                                              {"goodOEM", 1.2, ~0U, PartitioningDriver::OEMYes}});
   1618     PartitioningCompilation compilationBestOEM(&model, devicesBestOEM);
   1619     ASSERT_EQ(compilationBestOEM.finish(), Result::NO_ERROR);
   1620     const auto& planBestOEM = compilationBestOEM.getExecutionPlan();
   1621     ASSERT_EQ(planBestOEM.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
   1622     ASSERT_NE(planBestOEM.forTest_simpleGetDevice().get(), nullptr);
   1623     ASSERT_STREQ(planBestOEM.forTest_simpleGetDevice()->getName(), "goodOEM");
   1624 
   1625     // Verify that we get an error if no driver can run an OEM operation.
   1626     const auto devicesNoOEM = makeDevices({{"noOEM", 0.5, ~0U, PartitioningDriver::OEMNo}});
   1627     PartitioningCompilation compilationNoOEM(&model, devicesNoOEM);
   1628     ASSERT_EQ(compilationNoOEM.finish(), Result::BAD_DATA);
   1629 
   1630     // Verify that we get an error if a driver can SUPPORT but not PREPARE an OEM operation.
   1631     const auto devicesIndecisiveOEM =
   1632             makeDevices({{"indecisiveOEM", 0.5, ~0U, PartitioningDriver::OEMIndecisive}});
   1633     PartitioningCompilation compilationIndecisiveOEM(&model, devicesIndecisiveOEM);
   1634     ASSERT_NE(compilationIndecisiveOEM.finish(), Result::NO_ERROR);
   1635 
   1636     // Verify that we get an error if there are no drivers (only CPU fallback).
   1637     PartitioningCompilation compilationNoDrivers(&model, makeDevices({}) /* no drivers */);
   1638     ASSERT_EQ(compilationNoDrivers.finish(), Result::BAD_DATA);
   1639 }
   1640 
   1641 TEST_F(PartitioningTest, RelaxedFP) {
   1642     const auto devices = makeDevices({// Best choice for non-relaxed model.
   1643                                       {"f32", 0.8, 0.9 /* relaxed */, ~0U},
   1644                                       // Best choice for relaxed model.
   1645                                       {"f16", 0.9, 0.8 /* relaxed */, ~0U}});
   1646 
   1647     auto TrivialTest = [&devices](bool doRelax, const char* expectDevice) {
   1648         // Trivial model consisting solely of one operation.
   1649         SCOPED_TRACE(expectDevice);
   1650         PartitioningModel model;
   1651         uint32_t opnd0 = model.addFloatOperand();
   1652         uint32_t opnd1 = model.addFloatOperand();
   1653         uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
   1654         model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2 });
   1655         model.relaxComputationFloat32toFloat16(doRelax);
   1656         model.finish();
   1657         ASSERT_TRUE(model.isValid());
   1658         // Verify that the model will be executed on the appropriate device.
   1659         // No need to compare the original model to the model from the plan -- we
   1660         // didn't actually do any partitioning.
   1661         ExecutionPlan plan;
   1662         ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
   1663                   ANEURALNETWORKS_NO_ERROR);
   1664         ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
   1665         ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), expectDevice);
   1666     };
   1667 
   1668     ASSERT_NO_FATAL_FAILURE(TrivialTest(false, "f32"));
   1669     ASSERT_NO_FATAL_FAILURE(TrivialTest(true, "f16"));
   1670 }
   1671 
   1672 TEST_F(PartitioningTest, Perf) {
   1673     // The various type names used here are confusing.
   1674     //
   1675     // OperandType (from HAL file), WrapperType (from NeuralNetworksWrapper.h),
   1676     // and OperandCode (from NeuralNetworks.h) are different enums representing
   1677     // the same type kind -- e.g., OperandType::FLOAT32, WrapperType::FLOAT32,
   1678     // ANEURALNETWORKS_FLOAT32.  Corresponding enumerators have the same value.
   1679     //
   1680     // WrapperOperandType is the NeuralNetworksWrapper.h representation of a
   1681     // full operand type (WrapperType plus dimensions plus other attributes).
   1682 
   1683     auto TestType = [](OperandType operandType) {
   1684         SCOPED_TRACE(toString(operandType));
   1685         // Trivial model consisting solely of OEM operation.  We
   1686         // pick OEM operation because this allows us to use
   1687         // inputs and outputs of any number and type.
   1688         PartitioningModel model;
   1689         uint32_t opndIn = model.addOperand(static_cast<WrapperType>(operandType));
   1690         uint32_t opndOut = model.addOperationOEM1To1(opndIn);
   1691         model.identifyInputsAndOutputs({opndIn}, {opndOut});
   1692         model.finish();
   1693         ASSERT_TRUE(model.isValid());
   1694 
   1695         const Capabilities baseCapabilities = makeCapabilities(0.5);
   1696 
   1697         {
   1698             // better than base
   1699             Capabilities goodCapabilities = baseCapabilities;
   1700             update(&goodCapabilities, operandType, 0.25);
   1701 
   1702             const auto devices =
   1703                     makeDevices({{"base", baseCapabilities, ~0U, PartitioningDriver::OEMYes},
   1704                                  {"good", goodCapabilities, ~0U, PartitioningDriver::OEMYes}});
   1705 
   1706             // Verify that model will be executed on "good".
   1707             // No need to compare the original model to the model from the plan -- we
   1708             // didn't actually do any partitioning.
   1709             ExecutionPlan plan;
   1710             ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
   1711                       ANEURALNETWORKS_NO_ERROR);
   1712             ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
   1713             ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "good");
   1714         }
   1715 
   1716         {
   1717             // worse than base
   1718             Capabilities badCapabilities = baseCapabilities;
   1719             update(&badCapabilities, operandType, 0.75);
   1720             const auto devices =
   1721                     makeDevices({{"base", baseCapabilities, ~0U, PartitioningDriver::OEMYes},
   1722                                  {"bad", badCapabilities, ~0U, PartitioningDriver::OEMYes}});
   1723 
   1724             // Verify that model will be executed on "base".
   1725             // No need to compare the original model to the model from the plan -- we
   1726             // didn't actually do any partitioning.
   1727             ExecutionPlan plan;
   1728             ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
   1729                       ANEURALNETWORKS_NO_ERROR);
   1730             ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
   1731             ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "base");
   1732         }
   1733     };
   1734 
   1735     for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
   1736          type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
   1737         TestType(static_cast<OperandType>(type));
   1738     }
   1739     for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
   1740          type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
   1741         TestType(static_cast<OperandType>(type));
   1742     }
   1743 }
   1744 
// Test token rehashing during the compilation step.
class CacheTest : public PartitioningTest {
   protected:
    // Creates a unique temporary cache directory for each test case.
    virtual void SetUp() override {
        PartitioningTest::SetUp();
        char cacheDirTemp[] = "/data/local/tmp/TestCompilationCachingXXXXXX";
        char* cacheDir = mkdtemp(cacheDirTemp);
        ASSERT_NE(cacheDir, nullptr);
        mCacheDir = cacheDir;
    }

    // Removes the temporary cache directory -- but only when the test passed,
    // so the directory contents remain available for debugging a failure.
    virtual void TearDown() override {
        if (!::testing::Test::HasFailure()) {
            std::filesystem::remove_all(mCacheDir);
        }
        PartitioningTest::TearDown();
    }

    // Expects every pair of tokens in the provided list to be distinct.
    void expectUniqueTokens(const std::vector<std::vector<uint8_t>>& tokens) {
        for (uint32_t i = 0; i < tokens.size(); i++) {
            SCOPED_TRACE(i);
            for (uint32_t j = i + 1; j < tokens.size(); j++) {
                SCOPED_TRACE(j);
                EXPECT_NE(tokens[i], tokens[j]);
            }
        }
    }

    // Launch a single run of the partitioner against the provided model and device list with
    // cache token provided as tokenIn. Find the partition for the device with deviceName.
    // Record the transformed token into tokenOut.
    // If tokenIn is empty, no caching information will be provided to the partitioner,
    // and tokenOut is cleared.
    void getTransformedCacheTokenSingle(const PartitioningModel& model,
                                        const std::vector<std::shared_ptr<Device>>& devices,
                                        const char* deviceName, const std::vector<uint8_t>& tokenIn,
                                        ExecutePreference preference,
                                        std::vector<uint8_t>* tokenOut) {
        // Compile the model and get the execution plan.
        PartitioningCompilation compilation(&model, devices);
        if (!tokenIn.empty()) {
            compilation.setCaching(mCacheDir.c_str(), tokenIn);
        }
        compilation.setPreference(preference);
        ASSERT_EQ(compilation.finish(), Result::NO_ERROR);
        const ExecutionPlan& plan = compilation.getExecutionPlan();

        // Find the cache info for the device.
        const uint8_t* token = nullptr;
        if (plan.forTest_getKind() == ExecutionPlan::Kind::SIMPLE) {
            // Simple body: the whole model must have landed on the requested device.
            ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), deviceName);
            token = plan.forTest_simpleGetCacheToken();
        } else if (plan.forTest_getKind() == ExecutionPlan::Kind::COMPOUND) {
            // Compound body: scan the steps for the one assigned to the requested device.
            const auto& steps = plan.forTest_compoundGetSteps();
            bool found = false;
            for (const auto& step : steps) {
                // In general, two or more partitions can be on the same device. However, this will
                // not happen on the test models with only 2 operations.
                if (strcmp(step->getDevice()->getName(), deviceName) == 0) {
                    ASSERT_FALSE(found);
                    token = step->forTest_getCacheToken();
                    found = true;
                }
            }
            ASSERT_TRUE(found);
        } else {
            FAIL();
        }

        // Retrieve the transformed token from the cache info.
        if (token == nullptr) {
            tokenOut->clear();
        } else {
            tokenOut->resize(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN);
            std::copy(token, token + ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, tokenOut->begin());
        }
    }

    // A wrapper of getTransformedCacheTokenSingle, which runs getTransformedCacheTokenSingle
    // multiple times and checks if the transformation provides consistent result.
    void getTransformedCacheToken(const PartitioningModel& model,
                                  const std::vector<std::shared_ptr<Device>>& devices,
                                  const char* deviceName, const std::vector<uint8_t>& tokenIn,
                                  ExecutePreference preference, std::vector<uint8_t>* tokenOut) {
        getTransformedCacheTokenSingle(model, devices, deviceName, tokenIn, preference, tokenOut);

        // Test if the runtime maps to the same cache token every time for the same compilation
        // setup.
        for (uint32_t i = 0; i < 10; i++) {
            std::vector<uint8_t> token;
            SCOPED_TRACE(i);
            getTransformedCacheTokenSingle(model, devices, deviceName, tokenIn, preference, &token);
            EXPECT_EQ(*tokenOut, token);
        }
    }

    // Builds the two-operation model shared by the caching tests:
    // (opnd0, opnd1) -> operation 0 -> opnd2; (opnd2, opnd3) -> operation 1 -> opnd4.
    void CreateModelForCachingTests(PartitioningModel* model) {
        uint32_t opnd0 = model->addFloatOperand();
        uint32_t opnd1 = model->addFloatOperand();
        uint32_t opnd2 = model->addOperation2To1V1_0(0, opnd0, opnd1);
        uint32_t opnd3 = model->addFloatOperand();
        uint32_t opnd4 = model->addOperation2To1V1_0(1, opnd2, opnd3);
        model->identifyInputsAndOutputs({opnd0, opnd1, opnd3}, {opnd4});
        model->finish();
        ASSERT_TRUE(model->isValid());
    }

    // Path of the per-test temporary cache directory created in SetUp().
    std::string mCacheDir;
};
   1853 
   1854 // Test the case when no token is provided by the application and the execution plan has a
   1855 // simple body.
   1856 TEST_F(CacheTest, CacheTokenNoneSimpleBody) {
   1857     PartitioningModel model;
   1858     CreateModelForCachingTests(&model);
   1859 
   1860     // deviceA can execute the whole model.
   1861     const auto deviceA = makeDevices({
   1862             {"deviceA", 0.5, ~0U},
   1863     });
   1864 
   1865     std::vector<uint8_t> tokenIn, tokenOut;
   1866     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
   1867                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
   1868     EXPECT_TRUE(tokenOut.empty());
   1869 }
   1870 
   1871 // Test if the runtime maps to different cache tokens for devices with different names in
   1872 // execution plan with a simple body.
   1873 TEST_F(CacheTest, CacheTokenDifferentDeviceNamesSimpleBody) {
   1874     PartitioningModel model;
   1875     CreateModelForCachingTests(&model);
   1876 
   1877     // Two devices that can both execute the whole model.
   1878     const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});
   1879     const auto deviceB = makeDevices({{"deviceB", 0.5, ~0U}});
   1880 
   1881     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
   1882     std::vector<uint8_t> deviceAToken, deviceBToken;
   1883     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
   1884                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceAToken);
   1885     getTransformedCacheToken(model, deviceB, "deviceB", tokenIn,
   1886                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceBToken);
   1887     expectUniqueTokens({deviceAToken, deviceBToken});
   1888 }
   1889 
   1890 // Test if the runtime maps to different cache tokens for devices with different version strings in
   1891 // execution plan with a simple body.
   1892 TEST_F(CacheTest, CacheTokenDifferentDeviceVersionStringsSimpleBody) {
   1893     PartitioningModel model;
   1894     CreateModelForCachingTests(&model);
   1895 
   1896     // Two devices that can both execute the whole model.
   1897     const auto deviceA_1_0 = makeDevices({{"deviceA", "1.0", 0.5, ~0U}});
   1898     const auto deviceA_1_1 = makeDevices({{"deviceA", "1.1", 0.5, ~0U}});
   1899 
   1900     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
   1901     std::vector<uint8_t> deviceA_1_0_Token, deviceA_1_1_Token;
   1902     getTransformedCacheToken(model, deviceA_1_0, "deviceA", tokenIn,
   1903                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_0_Token);
   1904     getTransformedCacheToken(model, deviceA_1_1, "deviceA", tokenIn,
   1905                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_1_Token);
   1906     expectUniqueTokens({deviceA_1_0_Token, deviceA_1_1_Token});
   1907 }
   1908 
   1909 // Test if the runtime maps to different cache tokens for compilations with different preferences
   1910 // in execution plan with a simple body.
   1911 TEST_F(CacheTest, CacheTokenDifferentPreferencesSimpleBody) {
   1912     PartitioningModel model;
   1913     CreateModelForCachingTests(&model);
   1914 
   1915     // One device that can execute the whole model.
   1916     const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});
   1917 
   1918     std::vector<uint8_t> fastToken, powerToken, sustainedToken;
   1919     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
   1920     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
   1921                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &fastToken);
   1922     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
   1923                              ExecutePreference::PREFER_LOW_POWER, &powerToken);
   1924     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
   1925                              ExecutePreference::PREFER_SUSTAINED_SPEED, &sustainedToken);
   1926     expectUniqueTokens({fastToken, powerToken, sustainedToken});
   1927 }
   1928 
   1929 // Test if the runtime maps to different cache tokens for compilations with different tokens
   1930 // provided by application in execution plan with a simple body.
   1931 TEST_F(CacheTest, CacheTokenDifferentTokensSimpleBody) {
   1932     PartitioningModel model;
   1933     CreateModelForCachingTests(&model);
   1934 
   1935     // One device that can execute the whole model.
   1936     const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});
   1937 
   1938     std::vector<uint8_t> tokenOut1, tokenOut2;
   1939     std::vector<uint8_t> tokenIn1(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
   1940     std::vector<uint8_t> tokenIn2(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 1);
   1941     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn1,
   1942                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
   1943     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn2,
   1944                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
   1945     expectUniqueTokens({tokenOut1, tokenOut2});
   1946 }
   1947 
   1948 // Test the case when no token is provided by the application and the execution plan has a
   1949 // compound body.
   1950 TEST_F(CacheTest, CacheTokenNoneCompoundBody) {
   1951     PartitioningModel model;
   1952     CreateModelForCachingTests(&model);
   1953 
   1954     // DeviceA executes the first operation only.
   1955     const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
   1956 
   1957     std::vector<uint8_t> tokenIn, tokenOut;
   1958     getTransformedCacheToken(model, devices, "deviceA", tokenIn,
   1959                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
   1960     EXPECT_TRUE(tokenOut.empty());
   1961     getTransformedCacheToken(model, devices, "deviceB", tokenIn,
   1962                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
   1963     EXPECT_TRUE(tokenOut.empty());
   1964 }
   1965 
   1966 // Test if the runtime maps to different cache tokens for devices with different names in
   1967 // execution plan with a compound body.
   1968 TEST_F(CacheTest, CacheTokenDifferentDeviceNamesCompoundBody) {
   1969     PartitioningModel model;
   1970     CreateModelForCachingTests(&model);
   1971 
   1972     // DeviceA executes the first operation only.
   1973     const auto devices1 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceC", 0.5, 1 << 1}});
   1974     // DeviceB executes the first operation only.
   1975     const auto devices2 = makeDevices({{"deviceB", 0.8, ~0U}, {"deviceC", 0.5, 1 << 1}});
   1976 
   1977     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
   1978     std::vector<uint8_t> deviceAToken, deviceBToken;
   1979     getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
   1980                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceAToken);
   1981     getTransformedCacheToken(model, devices2, "deviceB", tokenIn,
   1982                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceBToken);
   1983     expectUniqueTokens({deviceAToken, deviceBToken});
   1984 }
   1985 
   1986 // Test if the runtime maps to different cache tokens for devices with different names in
   1987 // execution plan with a compound body.
   1988 TEST_F(CacheTest, CacheTokenDifferentDeviceVersionStringsCompoundBody) {
   1989     PartitioningModel model;
   1990     CreateModelForCachingTests(&model);
   1991 
   1992     // DeviceA executes the first operation only.
   1993     const auto devices1 = makeDevices({{"deviceA", "1.0", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
   1994     // DeviceB executes the first operation only.
   1995     const auto devices2 = makeDevices({{"deviceA", "1.1", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
   1996 
   1997     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
   1998     std::vector<uint8_t> deviceA_1_0_Token, deviceA_1_1_Token;
   1999     getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
   2000                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_0_Token);
   2001     getTransformedCacheToken(model, devices2, "deviceA", tokenIn,
   2002                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_1_Token);
   2003     expectUniqueTokens({deviceA_1_0_Token, deviceA_1_1_Token});
   2004 }
   2005 
   2006 // Test if the runtime maps to different cache tokens for compilations with different preferences
   2007 // in execution plan with a compound body.
   2008 TEST_F(CacheTest, CacheTokenDifferentPreferencesCompoundBody) {
   2009     PartitioningModel model;
   2010     CreateModelForCachingTests(&model);
   2011 
   2012     // DeviceA executes the first operation only.
   2013     const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
   2014 
   2015     std::vector<uint8_t> fastToken, powerToken, sustainedToken;
   2016     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
   2017     getTransformedCacheToken(model, devices, "deviceA", tokenIn,
   2018                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &fastToken);
   2019     getTransformedCacheToken(model, devices, "deviceA", tokenIn,
   2020                              ExecutePreference::PREFER_LOW_POWER, &powerToken);
   2021     getTransformedCacheToken(model, devices, "deviceA", tokenIn,
   2022                              ExecutePreference::PREFER_SUSTAINED_SPEED, &sustainedToken);
   2023     expectUniqueTokens({fastToken, powerToken, sustainedToken});
   2024 }
   2025 
   2026 // Test if the runtime maps to different cache tokens for compilations with different tokens
   2027 // provided by application in execution plan with a compound body.
   2028 TEST_F(CacheTest, CacheTokenDifferentTokensCompoundBody) {
   2029     PartitioningModel model;
   2030     CreateModelForCachingTests(&model);
   2031 
   2032     // DeviceA executes the first operation only.
   2033     const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
   2034 
   2035     std::vector<uint8_t> tokenOut1, tokenOut2;
   2036     std::vector<uint8_t> tokenIn1(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
   2037     std::vector<uint8_t> tokenIn2(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 1);
   2038     getTransformedCacheToken(model, devices, "deviceA", tokenIn1,
   2039                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
   2040     getTransformedCacheToken(model, devices, "deviceA", tokenIn2,
   2041                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
   2042     expectUniqueTokens({tokenOut1, tokenOut2});
   2043 }
   2044 
   2045 // Test if the runtime maps to different cache tokens for compilations with different partitioning
   2046 // outcome in execution plan with a compound body.
   2047 TEST_F(CacheTest, CacheTokenDifferentPartitionsCompoundBody) {
   2048     PartitioningModel model;
   2049     CreateModelForCachingTests(&model);
   2050 
   2051     // DeviceA executes the whole model.
   2052     const auto devices1 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 0U}});
   2053     // DeviceA executes the first operation only.
   2054     const auto devices2 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
   2055     // DeviceA executes the second operation only.
   2056     const auto devices3 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 0}});
   2057 
   2058     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
   2059     std::vector<uint8_t> tokenOut1, tokenOut2, tokenOut3;
   2060     getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
   2061                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
   2062     getTransformedCacheToken(model, devices2, "deviceA", tokenIn,
   2063                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
   2064     getTransformedCacheToken(model, devices3, "deviceA", tokenIn,
   2065                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut3);
   2066     expectUniqueTokens({tokenOut1, tokenOut2, tokenOut3});
   2067 }
   2068 
// Very basic tests of some of the PerformanceInfo functionality.
// Placed in this file because partitioning is the consumer of this functionality.
// Empty fixture: these tests need no shared setup or teardown.
class PerfTest : public ::testing::Test {};
   2072 
   2073 TEST_F(PerfTest, Lookup) {
   2074     // Derive an arbitrary (but reproducible) performance value from an OperandType.
   2075     // We'll use this to ensure that we can save and then recover a type's performance.
   2076     auto typePerf = [](OperandType type) { return float(static_cast<uint32_t>(type)); };
   2077 
   2078     Capabilities capabilities = makeCapabilities(-1.0f);
   2079 
   2080     for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
   2081          type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
   2082         OperandType operandType = static_cast<OperandType>(type);
   2083         update(&capabilities, operandType, typePerf(operandType));
   2084     }
   2085     for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
   2086          type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
   2087         OperandType operandType = static_cast<OperandType>(type);
   2088         update(&capabilities, operandType, typePerf(operandType));
   2089     }
   2090 
   2091     // Make sure lookup retrieves the values stored by update
   2092 
   2093     for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
   2094          type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
   2095         OperandType operandType = static_cast<OperandType>(type);
   2096         SCOPED_TRACE(toString(operandType));
   2097         EXPECT_EQ(lookupExecTime(capabilities, operandType), typePerf(operandType));
   2098     }
   2099     for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
   2100          type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
   2101         OperandType operandType = static_cast<OperandType>(type);
   2102         SCOPED_TRACE(toString(operandType));
   2103         EXPECT_EQ(lookupExecTime(capabilities, operandType), typePerf(operandType));
   2104     }
   2105 
   2106     // Check the behavior of a missing type
   2107 
   2108     OperandType operandType =
   2109             static_cast<OperandType>(static_cast<uint32_t>(OperandTypeRange::BASE_MAX) + 1);
   2110     EXPECT_EQ(lookupExecTime(capabilities, operandType), FLT_MAX);
   2111 }
   2112 
   2113 }  // namespace
   2114