// Home | History | Annotate | Download | only in test
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #undef NDEBUG
     18 
     19 #include "Bridge.h"
     20 #include "CompilationBuilder.h"
     21 #include "Manager.h"
     22 #include "ModelBuilder.h"
     23 #include "NeuralNetworks.h"
     24 #include "NeuralNetworksWrapper.h"
     25 #include "SampleDriver.h"
     26 #include "Utils.h"
     27 #include "ValidateHal.h"
     28 
     29 #include <algorithm>
     30 #include <cassert>
     31 #include <cstdio>
     32 #include <random>
     33 #include <set>
     34 #include <tuple>
     35 #include <utility>
     36 #include <vector>
     37 
     38 #include <unistd.h>
     39 
     40 #include <android-base/logging.h>
     41 #include <android/sharedmem.h>
     42 #include <gtest/gtest.h>
     43 
     44 // Uncomment the following line to generate some debugging output that
     45 // may be useful when analyzing failures:
     46 //
     47 // #define VERBOSE VERBOSE
     48 
     49 // Uncomment the following line to generate graphs from models:
     50 //
     51 // #define GRAPH GRAPH
     52 
     53 // We randomly generate tests (model + input data) at runtime, and verify
     54 // that we get the same results whether we do partitioned compilation/execution
     55 // or non partitioned compilation/execution.  We perform a test as follows:
     56 //
     57 // (1) Randomly generate a model (graph and weights), randomly generate input
     58 //     data, randomly assign inputs and outputs to CPU memory or to shared
     59 //     memory.
     60 //
     61 //     Randomly leaves dimensions unset for intermediate operands.
     62 //
     63 // (2) Randomly generate drivers based on the sample driver, each of which
     64 //     executes models on the CPU.  They differ according to which operations
     65 //     they support.
     66 //
     67 // (3) Compile and execute without partitioning, saving off the results.
     68 //
     69 // (4) Compile and execute with partitioning.
     70 //
     71 // (5) Verify that the saved results from (3) match the results from (4).
     72 //
     73 // For simplicity, all data (model inputs, model outputs, weights,
     74 // temps) are of the same type: a 2-D TENSOR_FLOAT32 where the two
     75 // dimensions are fixed throughout a particular test case (and
     76 // randomly determined).  This prevents us from having to find a
     77 // mechanism to "resize" data (e.g., if ADD#a operates on data of size
     78 // 2x2, ADD#b operates on data of size 3x3, and the outputs of ADD#a
     79 // and ADD#b become inputs of ADD#c, do we need to insert one or more
     80 // operations between (say) ADD#a and ADD#c to convert ADD#2's data
     81 // from size 2x2 to size 3x3 in order to match ADD#b).  In the few
     82 // cases where an operand cannot be of this type, it is a constant
     83 // (e.g., activation functions and RNN bias).
     84 //
     85 // Each operation we generate has a signature (described in more
     86 // detail later).  The randomly generated drivers decide which
     87 // operations they can execute by checking operation signatures.  Once
     88 // we have built the model and know the set of signatures, we randomly
     89 // assign each signature to a driver.  No signature is supported by
     90 // multiple drivers -- we're not testing the logic that the
     91 // partitioning algorithm uses to select the best driver for an
     92 // operation.
     93 
     94 namespace android {
     95 
     96 using CompilationBuilder = nn::CompilationBuilder;
     97 using Device = nn::Device;
     98 using DeviceManager = nn::DeviceManager;
     99 using ExecutionPlan = nn::ExecutionPlan;
    100 using HidlModel = hardware::neuralnetworks::V1_1::Model;
    101 using MemoryBuilder = nn::Memory;
    102 using ModelBuilder = nn::ModelBuilder;
    103 using Result = nn::wrapper::Result;
    104 using SampleDriver = nn::sample_driver::SampleDriver;
    105 using WrapperCompilation = nn::wrapper::Compilation;
    106 using WrapperExecution = nn::wrapper::Execution;
    107 using WrapperMemory = nn::wrapper::Memory;
    108 using WrapperModel = nn::wrapper::Model;
    109 using WrapperOperandType = nn::wrapper::OperandType;
    110 using WrapperType = nn::wrapper::Type;
    111 
    112 namespace {
    113 
    114 /// Configure test size //////////////////////////////////////////////////////////
    115 
// Upper bound on the number of operations per generated model.
// We may exceed this in order to connect otherwise disjoint subgraphs.
static const unsigned kMaxNumOperations = 100;

// We build models to process 2-D square tensors up to this size in each dimension;
// note that the API promotes by-value weights larger than 128 to by-reference,
// so we want to ensure that we can pick both types that exceed and types that do
// not exceed this size.
static const unsigned kMaxProblemSize = 8;

// First seed for pseudorandom test generation.
static const unsigned kFirstSeed = 0;

// Number of test cases.  Each test case gets its own seed
// (kFirstSeed, kFirstSeed + 1, ...).
static const unsigned kNumTestCases = 225;

// Force all graph weights into a single pool (as we recommend to users)
// or allow them to be distributed across multiple pools (more stress
// on the partitioning algorithm and the rest of the runtime)?
// Forcing all graph weights into a single pool may be necessary to
// prevent large graphs from running up against http://b/70302693
// "NNAPI overuses (?) fds".
static const bool kAllWeightsInOnePool = false;

//////////////////////////////////////////////////////////////////////////////////

// The signature of an operation consists of the operation type (e.g.,
// ADD) and the activation function (use -1 in the case of an
// operation type for which the activation function is inapplicable).
typedef std::pair<ANeuralNetworksOperationType, int> Signature;
    145 
    146 // This class adds some simple utilities on top of
    147 // ::android::nn::wrapper::Model.  For example, it provides access to
    148 // certain features from ModelBuilder that are not exposed by the base
    149 // class (such as inputCount() and operation index).
    150 class TestModel : public WrapperModel {
    151 public:
    152 
    153     uint32_t addOperation(ANeuralNetworksOperationType type, const std::vector<uint32_t>& inputs,
    154                           const std::vector<uint32_t>& outputs) {
    155         const uint32_t operationIndex = operationCount();
    156         mOperations.push_back(outputs);
    157         WrapperModel::addOperation(type, inputs, outputs);
    158         return operationIndex;
    159     }
    160 
    161     uint32_t operationCount() const {
    162         return mOperations.size();
    163     }
    164 
    165     uint32_t inputCount() const {
    166         return builder()->inputCount();
    167     }
    168     uint32_t outputCount() const {
    169         return builder()->outputCount();
    170     }
    171 
    172     const std::vector<uint32_t>& getOperationOutputs(uint32_t index) const {
    173         assert(index < mOperations.size());
    174         return mOperations[index];
    175     }
    176 
    177     // All values are immediately copied into the model (we need to do
    178     // this ourselves in cases where the underlying NNAPI does not).
    179     void setOperandValue(uint32_t index, const std::vector<float>& value) {
    180         const size_t length = value.size() * sizeof(float);
    181 
    182         if (length <= ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES) {
    183             WrapperModel::setOperandValue(index, value.data(), length);
    184         } else {
    185             mOperandValues.push_back(value);
    186             WrapperModel::setOperandValue(index, mOperandValues.back().data(), length);
    187         }
    188     }
    189 
    190     void setOperandValue(uint32_t index, int32_t value) {
    191         assert(sizeof(value) <=  ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES);
    192         WrapperModel::setOperandValue(index, &value, sizeof(value));
    193     }
    194 
    195 private:
    196 
    197     const ModelBuilder* builder() const {
    198         return reinterpret_cast<const ModelBuilder*>(getHandle());
    199     }
    200 
    201     // Representation of operations: vector index is operation number,
    202     // vector value is operation's output operands.
    203     std::vector<std::vector<uint32_t>> mOperations;
    204 
    205     // Large operand values -- not immediately copied into the
    206     // WrapperModel, so remembered here instead.
    207     std::vector<std::vector<float>> mOperandValues;
    208 };
    209 
    210 // This class adds some simple utilities on top of
    211 // ::android::nn::wrapper::Compilation in order to provide access to
    212 // certain features from CompilationBuilder that are not exposed by
    213 // the base class.
    214 class TestCompilation : public WrapperCompilation {
    215 public:
    216     TestCompilation(const WrapperModel* model) : WrapperCompilation(model) {}
    217 
    218     Result setPartitioning(uint32_t partitioning) {
    219         return static_cast<Result>(builder()->setPartitioning(partitioning));
    220     }
    221 
    222     using WrapperCompilation::finish;
    223     Result finish(const std::vector<std::shared_ptr<Device>>& devices) {
    224         return static_cast<Result>(builder()->finish(devices));
    225     }
    226 
    227     const ExecutionPlan& getExecutionPlan() const {
    228         return builder()->forTest_getExecutionPlan();
    229     }
    230 
    231 private:
    232     const CompilationBuilder* builder() const {
    233         return reinterpret_cast<const CompilationBuilder*>(getHandle());
    234     }
    235     CompilationBuilder* builder() {
    236         return reinterpret_cast<CompilationBuilder*>(getHandle());
    237     }
    238 };
    239 
    240 // This class is used to manage a collection of memory regions,
    241 // disjoint windows onto a set of Memory instances, each of which is
    242 // associated with a single shared memory region.  Each region and
    243 // Memory instance is assigned a number.  The usage pattern is as
    244 // follows:
    245 // - Call addMemory() and addRegion() as many times as needed to
    246 //   declare (but not define) Memory instances and declare region
    247 //   instances.
    248 // - Call layout() to define the Memory instances.
    249 // - Call getRegion() as many times as needed to get the details
    250 //   of memory regions (such as address, or Memory/offset/length).
    251 // The Memory instances created by layout() are owned by the
    252 // TestMemories instance, and are destroyed when the TestMemories
    253 // instance is destroyed.
    254 class TestMemories {
    255 public:
    256     TestMemories() = default;
    257     ~TestMemories();
    258 
    259     TestMemories(const TestMemories&) = delete;
    260     TestMemories& operator=(const TestMemories&) = delete;
    261 
    262     unsigned addMemory() {
    263         assert(!mLayoutDone);
    264         mMemorySizes.push_back(0);
    265         return memoryCount() - 1;
    266     }
    267     unsigned memoryCount() const {
    268         return mMemorySizes.size();
    269     }
    270 
    271     unsigned addRegion(unsigned memoryIndex, uint32_t length) {
    272         assert(!mLayoutDone);
    273         assert(memoryIndex < memoryCount());
    274         uint32_t& memorySize = mMemorySizes[memoryIndex];
    275         auto desc = std::make_tuple(memoryIndex, (uint32_t)memorySize, length);
    276         mRegions.push_back(desc);
    277         memorySize += length;
    278         return regionCount() - 1;
    279     }
    280     unsigned regionCount() const {
    281         return mRegions.size();
    282     }
    283 
    284     void layout();
    285 
    286     void* getRegion(unsigned regionIndex,
    287                     const WrapperMemory** pMemory, uint32_t* pOffset, uint32_t* pLength) {
    288         assert(mLayoutDone);
    289         assert(regionIndex < regionCount());
    290         const auto& regionDescriptor = mRegions[regionIndex];
    291         const WrapperMemory* memory = &mMemorys[std::get<0>(regionDescriptor)];
    292         uint32_t offset = std::get<1>(regionDescriptor);
    293         uint32_t length = std::get<2>(regionDescriptor);
    294 
    295         uint8_t* buffer;
    296         if (reinterpret_cast<MemoryBuilder*>(memory->get())->getPointer(&buffer) !=
    297             ANEURALNETWORKS_NO_ERROR) {
    298             assert(0);
    299         }
    300 
    301         if (pMemory) *pMemory = memory;
    302         if (pOffset) *pOffset = offset;
    303         if (pLength) *pLength = length;
    304 
    305         return buffer + offset;
    306     }
    307 
    308     void* getRegion(unsigned regionIndex) {
    309         return getRegion(regionIndex, nullptr, nullptr, nullptr);
    310     }
    311 
    312 private:
    313     // Index is the memory index; value is the size of the memory
    314     // (aggregate size of all regions in the memory).
    315     std::vector<uint32_t> mMemorySizes;
    316 
    317     // Index is the memory index.
    318     std::vector<WrapperMemory> mMemorys;
    319     std::vector<int> mFDs;
    320 
    321     // Index is the region index; tuple represents memory index,
    322     // region offset within memory, region length.
    323     std::vector<std::tuple<unsigned, uint32_t, uint32_t>> mRegions;
    324 
    325     // For sanity checking.
    326     bool mLayoutDone = false;
    327 };
    328 
    329 void TestMemories::layout() {
    330     assert(!mLayoutDone);
    331     for (uint32_t memorySize : mMemorySizes) {
    332         const int fd = ASharedMemory_create(nullptr, memorySize);
    333         assert(fd >= 0);
    334         mMemorys.emplace_back(memorySize, PROT_READ | PROT_WRITE, fd, 0);
    335         mFDs.push_back(fd);
    336     }
    337     mLayoutDone = true;
    338 }
    339 
    340 TestMemories::~TestMemories() {
    341     for (int fd : mFDs) {
    342         close(fd);
    343     }
    344 }
    345 
class RandomPartitioningTest : public ::testing::TestWithParam<unsigned> {
public:
    // The test parameter seeds the pseudorandom engine, making every test
    // case reproducible from its seed alone.
    RandomPartitioningTest() : mRandNumEng(GetParam() /* seed */), mRandNumUnitDist(0.0, 1.0) {}

    // Computes the (operation type, activation function) signature of an
    // operation; see the comment on Signature above.
    static Signature getSignature(const HidlModel& model, const Operation& operation);

protected:
    // Dumps the model as a graph, but only when GRAPH is defined.
    void graphDump(const WrapperModel& model);

    bool randBool() {
        return randUInt(2) == 1;
    }

    double randFrac() {  // [0.0, 1.0)
        return mRandNumUnitDist(mRandNumEng);
    }

    unsigned randUInt(unsigned limit) {  // [0, limit)
        return unsigned(randFrac() * limit);
    }

    // Represents an operation in which every input and output operand
    // is a TENSOR_FLOAT32 of dimensions [problemSize, problemSize] except:
    // - One input operand may be an activation function.
    // - Any number of input operands may be "special" in some other way
    //   (and in this implementation, not produced by any other operation).
    // We require that:
    // - There be at least one input operand that is neither an
    //    activation function nor "special".
    struct OperationPattern {
        int mOperationType;
        unsigned mNumInputs;
        unsigned mNumOutputs;
        int mActivationFunctionInputIndex;  // <0 if none

        // Returns operand index, or <0 if input is normal (must not
        // be called for an activation function operand).  Function
        // should have the following prototype:
        //
        //     int makeSpecialInput(unsigned problemSize, TestModel* model, unsigned inputIndex);
        //
        int (RandomPartitioningTest::*mMakeSpecialInput)(unsigned, TestModel*, unsigned);
    };

    // Table of the operations the generator can emit; defined below.
    static const OperationPattern kOperationPatterns[];

    // mMakeSpecialInput implementation for ANEURALNETWORKS_RNN: produces
    // the bias operand (input 3) as a randomly-valued 1-D constant tensor;
    // all other inputs are "normal" (returns -1).
    int makeRnnSpecialInput(unsigned problemSize, TestModel* model, unsigned inputIndex) {
        if (inputIndex != 3) {
            return -1;
        }

        // input operand 3 is bias, a 1-D tensor
        const WrapperOperandType biasType(WrapperType::TENSOR_FLOAT32, { problemSize });
        const uint32_t operandIndex = model->addOperand(&biasType);
        std::vector<float> biasValue(problemSize);
        std::generate(biasValue.begin(), biasValue.end(),
                      [this]{ return randFrac(); });
        model->setOperandValue(operandIndex, biasValue);
        return int(operandIndex);
    }

#ifdef VERBOSE
    // Debug helper: streams operation/operand/input/output counts of a
    // model, with input and output counts also shown as fractions of the
    // operand count.
    class ModelStats {
    public:
        ModelStats(const ModelBuilder* model) :
                mBuilder(model) { }
        ModelStats(const WrapperModel* model) :
                mBuilder(reinterpret_cast<const ModelBuilder*>(model->getHandle())) { }
        friend std::ostream& operator<<(std::ostream& out, const ModelStats& stats) {
            const uint32_t operandCount = stats.mBuilder->operandCount();
            const uint32_t inputCount = stats.mBuilder->inputCount();
            const uint32_t outputCount = stats.mBuilder->outputCount();
            out << "operationCount = " << stats.mBuilder->operationCount()
                << ", operandCount = " << operandCount
                << ", inputCount = " << inputCount
                << " (" << (double(inputCount) / operandCount) << ")"
                << ", outputCount = " << outputCount
                << " (" << (double(outputCount) / operandCount) << ")";
            return out;
        }
    private:
        const ModelBuilder* mBuilder;
    };
#endif

private:
    // Pseudorandom engine (seeded from the test parameter) and the unit
    // distribution underlying randBool()/randFrac()/randUInt().
    std::mt19937 mRandNumEng;
    std::uniform_real_distribution<double> mRandNumUnitDist;
};
    435 
// Table of operations the generator can emit.  Fields are, in order:
// operation type, number of inputs, number of outputs, activation-function
// input index (<0 if none), and special-input maker (nullptr if none) --
// see OperationPattern.
const RandomPartitioningTest::OperationPattern RandomPartitioningTest::kOperationPatterns[] = {
    { ANEURALNETWORKS_ADD, 3, 1, 2, nullptr },
    { ANEURALNETWORKS_LOGISTIC, 1, 1, -1, nullptr },
    { ANEURALNETWORKS_MUL, 3, 1, 2, nullptr },
    { ANEURALNETWORKS_RNN, 6, 2, 5, &RandomPartitioningTest::makeRnnSpecialInput },
    { ANEURALNETWORKS_TANH, 1, 1, -1, nullptr },
};
    443 
    444 Signature RandomPartitioningTest::getSignature(const HidlModel& model, const Operation& operation) {
    445     static const std::map<ANeuralNetworksOperationType, int> kOperationToActivation = []() {
    446         std::map<ANeuralNetworksOperationType, int> result;
    447         for (const auto& pattern : kOperationPatterns) {
    448             result[pattern.mOperationType] = pattern.mActivationFunctionInputIndex;
    449         }
    450         return result;
    451     }();
    452 
    453     const ANeuralNetworksOperationType operationType =
    454             static_cast<ANeuralNetworksOperationType>(operation.type);
    455     const int activationFunctionInputIndex = kOperationToActivation.at(operationType);
    456     if (activationFunctionInputIndex < 0) {
    457         return Signature(operationType, -1);
    458     }
    459 
    460     const Operand& operand = model.operands[operation.inputs[activationFunctionInputIndex]];
    461     assert(operand.lifetime == OperandLifeTime::CONSTANT_COPY);
    462     assert(operand.type == OperandType::INT32);
    463     int32_t value;
    464     memcpy(&value,
    465            &model.operandValues[operand.location.offset],
    466            operand.location.length);
    467     return Signature(operationType, value);
    468 }
    469 
// Dumps the model as a graph named after the test's seed -- but only when
// GRAPH is defined; otherwise this is a no-op.
void RandomPartitioningTest::graphDump([[maybe_unused]] const WrapperModel& model) {
#ifdef GRAPH
    const std::string name = "Test-" + std::to_string(GetParam());
    nn::bridge_tests::graphDump(name.c_str(),
                                reinterpret_cast<const ModelBuilder*>(model.getHandle()));
#endif
}
    477 
class TestDriver : public SampleDriver {
public:
    // Behaves like SampleDriver, except that it only supports
    // operations with the specified signatures.
    TestDriver(const char* name, std::set<Signature> signatures) :
            SampleDriver(name), mSignatures(std::move(signatures)) { }

    // Reports the same middling performance numbers for every driver, so
    // that partitioning decisions are driven by operation support rather
    // than by performance differences.
    Return<void> getCapabilities_1_1(getCapabilities_1_1_cb _hidl_cb) override {
        android::nn::initVLogMask();
        Capabilities capabilities =
                {.float32Performance = {.execTime = 0.75f, .powerUsage = 0.75f},
                 .quantized8Performance = {.execTime = 0.75f, .powerUsage = 0.75f},
                 .relaxedFloat32toFloat16Performance = {.execTime = 0.75f, .powerUsage = 0.75f}};
        _hidl_cb(ErrorStatus::NONE, capabilities);
        return Void();
    }

    // An operation is supported iff its signature is in mSignatures.
    Return<void> getSupportedOperations_1_1(const HidlModel& model,
                                            getSupportedOperations_cb cb) override {
        if (nn::validateModel(model)) {
            const size_t count = model.operations.size();
            std::vector<bool> supported(count);
            for (size_t i = 0; i < count; i++) {
                supported[i] =
                    (mSignatures.count(
                        RandomPartitioningTest::getSignature(
                            model,
                            model.operations[i])) != 0);
            }
            cb(ErrorStatus::NONE, supported);
        } else {
            // Invalid model: report an empty support vector.
            std::vector<bool> supported;
            cb(ErrorStatus::INVALID_ARGUMENT, supported);
        }
        return Void();
    }

    Return<ErrorStatus> prepareModel_1_1(const HidlModel& model, ExecutionPreference preference,
                                         const sp<IPreparedModelCallback>& callback) override {
        // NOTE: We verify that all operations in the model are supported.
        ErrorStatus outStatus = ErrorStatus::INVALID_ARGUMENT;
        auto ret = getSupportedOperations_1_1(
            model,
            [&outStatus](ErrorStatus inStatus, const hidl_vec<bool>& supportedOperations) {
                if (inStatus == ErrorStatus::NONE) {
                    if (std::all_of(supportedOperations.begin(), supportedOperations.end(),
                                    [](bool v){ return v; })) {
                        outStatus = ErrorStatus::NONE;
                    }
                }
            });
        if (ret.isOk() && (outStatus == ErrorStatus::NONE)) {
            // Every operation is supported; delegate the actual preparation.
            return SampleDriver::prepareModel_1_1(model, preference, callback);
        } else {
            // HIDL protocol: failure must be reported both through the
            // callback and through the return value.
            callback->notify(ErrorStatus::INVALID_ARGUMENT, nullptr);
            return ErrorStatus::INVALID_ARGUMENT;
        }
    }

private:
    // The set of operation signatures this driver claims to support.
    const std::set<Signature> mSignatures;
};
    540 
// Instantiate the test once per seed in [kFirstSeed, kFirstSeed + kNumTestCases).
INSTANTIATE_TEST_CASE_P(Seed, RandomPartitioningTest,
                        ::testing::Range(kFirstSeed, kFirstSeed + kNumTestCases));
    543 
    544 TEST_P(RandomPartitioningTest, Test) {
    545     LOG(INFO) << "RandomPartitioningTest: GetParam() = " << GetParam();
    546 
    547 #ifdef VERBOSE
    548     std::cout << std::setprecision(2) << std::fixed << std::setw(4);
    549 #endif
    550 
    551     const unsigned problemSize = 1+randUInt(kMaxProblemSize);
    552     const WrapperOperandType problemType(WrapperType::TENSOR_FLOAT32, { problemSize, problemSize });
    553     const WrapperOperandType unknownDimensionsType(WrapperType::TENSOR_FLOAT32, { 0, 0 });
    554 
    555     static const WrapperOperandType activationFunctionType(WrapperType::INT32, { });
    556 
    557     const unsigned numOperations = 2+randUInt(kMaxNumOperations-1);
    558     const bool allowDeadOperations = (randFrac() < 0.2);
    559     const bool allowUnknownDimensions = (randFrac() < 0.25);
    560 
    561     // TODO: The current algorithm builds the graph in a forward
    562     // direction (i.e., later-generated operations consume outputs
    563     // from earlier-generated operations).  In order to get more
    564     // variation in graph topology, perhaps we should also create an
    565     // algorithm to build the graph in a backward direction (i.e.,
    566     // later-generated operations produce outputs to be consumed by
    567     // earlier-generated operations).
    568     [[maybe_unused]] const bool buildForward = randBool();
    569 
    570     // TODO: Add a form of forced connectivity that operates by
    571     // joining disjoint subgraphs rather than by forcing a root.
    572     const bool forceCommonRoot = (randFrac() < 0.75);
    573 
    574     TestModel model;
    575     std::vector<uint32_t> modelInputs;
    576     std::vector<uint32_t> modelOutputs;
    577 
    578     // Each region in weights is a problem-sized 2-D TENSOR_FLOAT32.
    579     TestMemories weights;
    580 
    581     // Keep track of all normal (i.e., not activation function and not
    582     // "special") operands that are values (from setOperandValue*()).
    583     // .first: operand index
    584     // .second: if the operand is already defined (via setOperandValue*()) then ~0U;
    585     //          otherwise, the operand has yet to be defined, and this is the corresponding
    586     //          region index in "weights"
    587     std::vector<std::pair<uint32_t, unsigned>> valueOperands;
    588 
    589     // An operand is "dead" if it is not consumed by another operation
    590     // and is not a model output.  Key is operand index; value is
    591     // operation index.
    592     std::map<uint32_t, uint32_t> deadOperands;
    593 
    594     // An operation is "dead" if all of its outputs are dead.
    595     std::set<uint32_t> deadOperations;
    596 
    597     // Collect the signatures of operations in this model.
    598     std::set<Signature> signatures;
    599 
    600     // For reporting purposes, keep track of the number of root
    601     // operations (those that do not consume results produced by other
    602     // operations).
    603     unsigned rootOperationCount = 0;
    604 
    605     // Track if we added operands with unknown dimensions. In this case,
    606     // partitioned compilation will fail if such an operand is read in a
    607     // different partition than it is written.
    608     bool hasUnknownDimensions = false;
    609 
    610     // Generate operations.
    611     for (unsigned i = 0; i < numOperations; i++) {
    612         const unsigned operationPatternIndex =
    613                 randUInt(sizeof(kOperationPatterns)/sizeof(kOperationPatterns[0]));
    614         const auto& operationPattern = kOperationPatterns[operationPatternIndex];
    615 
    616         // INPUTS //////////////////////////////////////////////////////////////////////////////////
    617 
    618         std::vector<uint32_t> operationInputs(operationPattern.mNumInputs, ~0U);
    619 
    620         // First, process activation function and special inputs, and
    621         // keep track of which inputs remain.
    622         std::vector<uint32_t> normalOperationInputIndexes;
    623         int32_t activationFunction = -1;
    624         for (unsigned operationInputIndex = 0; operationInputIndex < operationPattern.mNumInputs;
    625              operationInputIndex++) {
    626             if (int(operationInputIndex) == operationPattern.mActivationFunctionInputIndex) {
    627                 const uint32_t operandIndex = model.addOperand(&activationFunctionType);
    628                 activationFunction = randUInt(4);
    629                 if (activationFunction == ANEURALNETWORKS_FUSED_RELU1) {
    630                     // workaround for http://b/69011131
    631                     activationFunction = ANEURALNETWORKS_FUSED_NONE;
    632                 }
    633                 model.setOperandValue(operandIndex, activationFunction);
    634                 operationInputs[operationInputIndex] = operandIndex;
    635                 continue;
    636             }
    637             if (operationPattern.mMakeSpecialInput != nullptr) {
    638                 const int operandIndex = (this->*(operationPattern.mMakeSpecialInput))(
    639                     problemSize, &model, operationInputIndex);
    640                 if (operandIndex >= 0) {
    641                     operationInputs[operationInputIndex] = operandIndex;
    642                     continue;
    643                 }
    644             }
    645             normalOperationInputIndexes.push_back(operationInputIndex);
    646         }
    647         assert(!normalOperationInputIndexes.empty());
    648         signatures.insert(Signature(operationPattern.mOperationType, activationFunction));
    649 
    650         // A (normal) operation input can be one of:
    651         // - a new or existing model input
    652         // - an output of an existing operation
    653         // - an OperandValue
    654         // - an OperandValueFromMemory
    655         // Some guidelines:
    656         // - We generally don't want all of an operation's inputs to be values (constants)
    657         const unsigned normalOperationInputCount = normalOperationInputIndexes.size();
    658         //     How many of this operation's inputs are constants?
    659         unsigned normalOperationInputConstantCount = 0;
    660         //     How many of this operation's inputs are model inputs?
    661         unsigned normalOperationInputModelInputCount = 0;
    662         // We begin by deciding what kind of input each (normal) operation will be; we don't
    663         // actually pick input operand indexes at this time, because we might override this
    664         // decision later.
    665         enum InputKind { IK_MODEL_INPUT, IK_OPERATION_OUTPUT, IK_VALUE };
    666         std::vector<InputKind> normalOperationInputKinds(normalOperationInputCount);
    667         std::generate(normalOperationInputKinds.begin(), normalOperationInputKinds.end(),
    668                       [this, &model,
    669                        numOperations,
    670                        normalOperationInputCount,
    671                        &normalOperationInputConstantCount,
    672                        &normalOperationInputModelInputCount]() -> InputKind {
    673                           // Constant?  Becomes less likely the more
    674                           // constants we already have as inputs to
    675                           // this operation.
    676                           if (randFrac() < 0.3 * (1 - double(normalOperationInputConstantCount) /
    677                                                    normalOperationInputCount)) {
    678                               normalOperationInputConstantCount++;
    679                               return IK_VALUE;
    680                           }
    681 
    682                           // Model input?  Becomes less likely the
    683                           // more model inputs we already have as
    684                           // inputs to this operation, and the further
    685                           // along we are in generating this model
    686                           // (i.e., the more operations we have
    687                           // generated).
    688                           if ((model.operationCount() == 0) ||
    689                               (randFrac() < 0.5 *
    690                                (1 - double(normalOperationInputModelInputCount) /
    691                                 normalOperationInputCount) *
    692                                std::min(0.3, (1 - double(model.operationCount()) /
    693                                               numOperations)))) {
    694                               normalOperationInputModelInputCount++;
    695                               return IK_MODEL_INPUT;
    696                           }
    697 
    698                           // Else output of an existing operation.
    699                           return IK_OPERATION_OUTPUT;
    700                       });
    701 
    702         // Now force common root or model input, if necessary.  (A
    703         // model must have at least one input.)
    704         auto force =
    705                 [this, &normalOperationInputKinds, normalOperationInputCount](InputKind forceKind){
    706             if (std::none_of(normalOperationInputKinds.begin(),
    707                              normalOperationInputKinds.end(),
    708                              [forceKind](InputKind kind){ return kind == forceKind; })) {
    709                 normalOperationInputKinds[randUInt(normalOperationInputCount)] = forceKind;
    710             }
    711         };
    712         if (forceCommonRoot && (model.operationCount() != 0)) {
    713             force(IK_OPERATION_OUTPUT);
    714         }
    715         if (modelInputs.empty()) {
    716             assert(model.operationCount() == 0);
    717             force(IK_MODEL_INPUT);
    718         }
    719 
    720         // Finally create the normal inputs.
    721         bool isRootOperation = true;
    722         for (unsigned i = 0; i < normalOperationInputCount; i++) {
    723             uint32_t operandIndex = ~0U;
    724             switch (normalOperationInputKinds[i]) {
    725                 case IK_MODEL_INPUT: {
    726                     if (!modelInputs.empty() && (randFrac() < 0.5)) {
    727                         operandIndex = modelInputs[randUInt(modelInputs.size())];
    728                     } else {
    729                         operandIndex = model.addOperand(&problemType);
    730                         modelInputs.push_back(operandIndex);
    731                     }
    732                     break;
    733                 }
    734                 case IK_OPERATION_OUTPUT: {
    735                     decltype(deadOperands.begin()) deadOperandI;
    736                     if (!deadOperands.empty() && (randFrac() < 0.5)) {
    737                         deadOperandI = deadOperands.begin();
    738                         std::advance(deadOperandI, randUInt(deadOperands.size()));
    739                         operandIndex = deadOperandI->first;
    740                     } else {
    741                         const uint32_t existingOperationIndex = randUInt(model.operationCount());
    742                         const auto& existingOperationOutputs =
    743                                 model.getOperationOutputs(existingOperationIndex);
    744                         operandIndex =
    745                             existingOperationOutputs[randUInt(existingOperationOutputs.size())];
    746                         deadOperandI = deadOperands.find(operandIndex);
    747                         assert(deadOperandI == deadOperands.end() ||
    748                                deadOperandI->second == existingOperationIndex);
    749                     }
    750                     if (deadOperandI != deadOperands.end()) {
    751                         const uint32_t correspondingOperation = deadOperandI->second;
    752                         deadOperands.erase(deadOperandI);
    753 
    754                         auto deadOperationI = deadOperations.find(correspondingOperation);
    755                         if (deadOperationI != deadOperations.end()) {
    756                             deadOperations.erase(deadOperationI);
    757                         }
    758                     }
    759                     isRootOperation = false;
    760                     break;
    761                 }
    762                 case IK_VALUE: {
    763                     if (!valueOperands.empty() && (randFrac() < 0.25)) {
    764                         operandIndex = valueOperands[randUInt(valueOperands.size())].first;
    765                     } else {
    766                         operandIndex = model.addOperand(&problemType);
    767                         if (randFrac() < 0.5) {
    768                             std::vector<float> value(problemSize * problemSize);
    769                             std::generate(value.begin(), value.end(), [this]{ return randFrac(); });
    770                             model.setOperandValue(operandIndex, value);
    771                             valueOperands.push_back(std::make_pair(operandIndex, ~0U));
    772                         } else {
    773                             unsigned memoryIndex = ~0U;
    774                             if ((weights.memoryCount() != 0) &&
    775                                 (kAllWeightsInOnePool || (randFrac() < 0.5))) {
    776                                 memoryIndex = randUInt(weights.memoryCount());
    777                             } else {
    778                                 memoryIndex = weights.addMemory();
    779                             }
    780                             const size_t length = problemSize * problemSize * sizeof(float);
    781                             const unsigned regionIndex = weights.addRegion(memoryIndex, length);
    782                             valueOperands.push_back(std::make_pair(operandIndex, regionIndex));
    783                         }
    784                     }
    785                     break;
    786                 }
    787                 default:
    788                     FAIL();
    789             }
    790             operationInputs[normalOperationInputIndexes[i]] = operandIndex;
    791         }
    792         if (isRootOperation) {
    793             rootOperationCount++;
    794         }
    795 
    796         // OUTPUTS /////////////////////////////////////////////////////////////////////////////////
    797 
    798         std::vector<uint32_t> operationOutputs(operationPattern.mNumOutputs);
    799         std::generate(operationOutputs.begin(), operationOutputs.end(),
    800                       [&model, &problemType, &unknownDimensionsType, &hasUnknownDimensions,
    801                        allowUnknownDimensions, this]{
    802                           // 3% unknowns causes ~35% of partitionings to fail
    803                           // (determined by commenting out the fallback code,
    804                           // running tests and noting number of failures).
    805                           if (allowUnknownDimensions && randFrac() < 0.03) {
    806                               hasUnknownDimensions = true;
    807                               return model.addOperand(&unknownDimensionsType);
    808                           } else {
    809                               return model.addOperand(&problemType);
    810                           }
    811                       });
    812 
    813         // OPERATION ///////////////////////////////////////////////////////////////////////////////
    814 
    815         const uint32_t operationIndex =
    816                 model.addOperation(operationPattern.mOperationType,
    817                                    operationInputs, operationOutputs);
    818         deadOperations.insert(operationIndex);
    819         std::for_each(operationOutputs.begin(), operationOutputs.end(),
    820                       [&deadOperands, operationIndex](uint32_t operandIndex) {
    821                           deadOperands.insert(std::make_pair(operandIndex, operationIndex));
    822                       });
    823     }
    824 
    825     // Now finalize the weights.
    826     weights.layout();
    827     for (const auto& valueOperand : valueOperands) {
    828         const uint32_t operandIndex = valueOperand.first;
    829         const unsigned regionIndex = valueOperand.second;
    830 
    831         if (regionIndex == ~0U) {
    832             continue;
    833         }
    834 
    835         const WrapperMemory* memory;
    836         uint32_t offset, length;
    837         float* region =
    838                 static_cast<float*>(weights.getRegion(regionIndex, &memory, &offset, &length));
    839         assert(length == problemSize * problemSize * sizeof(float));
    840         std::generate(region, region + problemSize * problemSize, [this]{ return randFrac(); });
    841         model.setOperandValueFromMemory(operandIndex, memory, offset, length);
    842     }
    843 
    844     // Now select model outputs.
    845     for (uint32_t operationIdx = 0, operationCount = model.operationCount();
    846          operationIdx < operationCount; operationIdx++) {
    847         const auto& outputs = model.getOperationOutputs(operationIdx);
    848         for (uint32_t outputIdx = 0, outputCount = outputs.size(); outputIdx < outputCount;
    849              outputIdx++) {
    850             bool modelOutput = false;
    851             const uint32_t operandIndex = outputs[outputIdx];
    852             const auto deadOperandI = deadOperands.find(operandIndex);
    853             if (deadOperandI != deadOperands.end()) {
    854                 // This is not consumed within the model, so unless we
    855                 // make it an output of the model, it's dead.  The
    856                 // further along we are in generating this model
    857                 // (i.e., the more operations we have generated), the
    858                 // more likely we are to classify this operation
    859                 // output as a model output.
    860                 const double probabilityOfModelOutput =
    861                         0.50 * [](double x){ return x*x; }((operationIdx + 1) / operationCount);
    862                 modelOutput = (randFrac() < probabilityOfModelOutput);
    863             } else {
    864                 // This is consumed within the model, so we'll rarely
    865                 // make it an output of the model.
    866                 modelOutput = (randFrac() < 0.05);
    867             }
    868             if (!modelOutput) {
    869                 continue;
    870             }
    871             modelOutputs.push_back(operandIndex);
    872             if (deadOperandI != deadOperands.end()) {
    873                 deadOperands.erase(deadOperandI);
    874                 const auto deadOperationI = deadOperations.find(operationIdx);
    875                 if (deadOperationI != deadOperations.end()) {
    876                     deadOperations.erase(deadOperationI);
    877                 }
    878             }
    879         }
    880     }
    881     if (!allowDeadOperations) {
    882         // For each dead operation, pick a random output to become a model output.
    883         for (uint32_t deadOperationIndex : deadOperations) {
    884             const auto& deadOperationOutputs = model.getOperationOutputs(deadOperationIndex);
    885             const uint32_t deadOperandIndex =
    886                     deadOperationOutputs[randUInt(deadOperationOutputs.size())];
    887             modelOutputs.push_back(deadOperandIndex);
    888         }
    889     }
    890     // A model must have at least one output.
    891     if (modelOutputs.empty()) {
    892         const auto& outputs = model.getOperationOutputs(randUInt(model.operationCount()));
    893         modelOutputs.push_back(outputs[randUInt(outputs.size())]);
    894     }
    895 
    896     model.identifyInputsAndOutputs(modelInputs, modelOutputs);
    897 #ifdef VERBOSE
    898     {
    899         std::cout << "Original model: " << ModelStats(&model) << std::endl;
    900         std::cout << "rootOperationCount = " << rootOperationCount
    901                   << ", deadOperations = ";
    902         if (allowDeadOperations) {
    903             std::cout << deadOperations.size();
    904         } else {
    905             std::cout << "forbidden (converted " << deadOperations.size() << ")";
    906         }
    907         std::cout << std::endl;
    908     }
    909 #endif
    910     ASSERT_EQ(model.finish(), Result::NO_ERROR);
    911     graphDump(model);
    912 
    913     // Non-partitioned compilation.
    914     TestCompilation c(&model);
    915     ASSERT_EQ(c.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
    916     ASSERT_EQ(c.finish(), Result::NO_ERROR);
    917 
    918     // Create some drivers for partitioned compilation.
    919     assert(!signatures.empty());
    920     std::vector<std::set<Signature>> signaturesForDriver(signatures.size());
    921     //     First assign each signature to a random driver (a driver is
    922     //     just represented as an entry in the signaturesForDriver
    923     //     vector).
    924     for (Signature signature : signatures) {
    925         signaturesForDriver[randUInt(signatures.size())].insert(signature);
    926     }
    927     //     Now remove each entry that has no signatures.
    928     auto firstExtra =
    929         std::remove_if(signaturesForDriver.begin(), signaturesForDriver.end(),
    930                        [](const std::set<Signature>& sigSet) { return sigSet.empty(); });
    931     if (firstExtra != signaturesForDriver.end()) {
    932         signaturesForDriver.erase(firstExtra, signaturesForDriver.end());
    933     }
    934     //     Now actually create the drivers.
    935     std::vector<std::shared_ptr<Device>> devices;
    936     for (unsigned i = 0; i < signaturesForDriver.size(); i++) {
    937         const std::string name = "TestDriver(" + std::to_string(i) + ")";
    938         devices.push_back(std::make_shared<Device>(
    939             name, new TestDriver(name.c_str(), signaturesForDriver[i])));
    940         ASSERT_TRUE(devices.back()->initialize());
    941     }
    942 
    943     // Partitioned compilation.
    944     // For test cases without unknown intermediate operand sizes we require the
    945     // partitioning to succeed without CPU fallback. With unknown sizes we
    946     // retry with a fallback if the non-fallback partitioning fails and require
    947     // the fallback to succeed.
    948     TestCompilation cNoFallback(&model);
    949     TestCompilation cWithFallback(&model);
    950     TestCompilation *c2 = nullptr;
    951     ASSERT_EQ(cNoFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
    952               Result::NO_ERROR);
    953     auto compilationResult = cNoFallback.finish(devices);
    954     if (hasUnknownDimensions && compilationResult == Result::OP_FAILED &&
    955         cNoFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize()) {
    956         ASSERT_EQ(cWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
    957                   Result::NO_ERROR);
    958         ASSERT_EQ(cWithFallback.finish(devices), Result::NO_ERROR);
    959         c2 = &cWithFallback;
    960     } else {
    961         ASSERT_EQ(compilationResult, Result::NO_ERROR);
    962         c2 = &cNoFallback;
    963     }
    964 
    965 #ifdef VERBOSE
    966     {
    967         std::cout << "signatures = " << signatures.size()
    968                   << ", devices = " << devices.size() << std::endl;
    969         const ExecutionPlan& plan = c2->getExecutionPlan();
    970         switch (plan.forTest_getKind()) {
    971             case ExecutionPlan::Kind::SIMPLE:
    972                 std::cout << "plan: simple" << std::endl;
    973                 break;
    974             case ExecutionPlan::Kind::COMPOUND: {
    975                 const auto& steps = plan.forTest_compoundGetSteps();
    976                 std::set<const Device*> devicesInPlan;
    977                 for (const auto& step : steps) {
    978                     devicesInPlan.insert(step->getDevice().get());
    979                 }
    980                 std::cout << "plan: compound, " << steps.size() << " steps over "
    981                           << devicesInPlan.size() << " devices" << std::endl;
    982                 for (unsigned i = 0; i < steps.size(); i++) {
    983                     std::cout << "Step " << i << ": "
    984                               << ModelStats(steps[i]->getSubModel()) << std::endl;
    985                 }
    986                 break;
    987             }
    988             default:
    989                 std::cout << "Unexpected plan kind: "
    990                     << static_cast<unsigned>(plan.forTest_getKind());
    991                 break;
    992         }
    993     }
    994 #endif
    995 
    996     // For execution:
    997     // - create master inputs (one long vector) and master output value
    998     //   - master inputs will be copied to actual inputs before each
    999     //     of the two executions
   1000     //   - master output will be used to fill actual outputs before each
   1001     //     of the two executions
   1002     // - create actual inputs and outputs
   1003     // - first execution (non-partitioned)
   1004     //   - initialize inputs and (to avoid unrelated oddities) outputs
   1005     //   - execute
   1006     //   - copy outputs to a save area (one long vector)
   1007     // - second execution (partitioned)
   1008     //   - (to avoid unrelated oddities) initialize inputs and outputs
   1009     //   - execute
   1010     //   - compare outputs to save area
   1011 
   1012     // If the runtime and drivers are working properly, execution
   1013     // should not change the inputs.  Nonetheless, we reinitialize the
   1014     // inputs for each execution, so as to avoid unrelated problems
   1015     // appearing to be problems related to unpartitioned execution
   1016     // versus partitioned execution.  Similarly, execution behavior
   1017     // should not be dependent on the outputs; but we'll initialize the
   1018     // outputs anyway.
   1019     std::vector<float> masterInputs(problemSize * problemSize * model.inputCount());
   1020     std::generate(masterInputs.begin(), masterInputs.end(), [this]{ return randFrac(); });
   1021     const float masterOutput = randFrac();
   1022 
   1023     // Create the memory for the actual inputs and outputs.
   1024     struct InputOutputDescriptor {
   1025         enum Kind { INPUT, OUTPUT };
   1026         Kind mKind;
   1027 
   1028         // The input or output either resides in a local buffer
   1029         // (mVector, in which case mMemoryRegion is ignored); or in a
   1030         // shared memory region within a TestMemories instance
   1031         // (mMemoryRegion, in which case mVector is ignored).
   1032         enum Location { VECTOR, REGION };
   1033         Location getLocation() const { return !mVector.empty() ? VECTOR : REGION; }
   1034 
   1035         std::vector<float> mVector;
   1036         unsigned mMemoryRegion;
   1037     };
   1038     std::vector<InputOutputDescriptor> ioDescriptors(model.inputCount() + model.outputCount());
   1039     for (unsigned i = 0; i < ioDescriptors.size(); i++) {
   1040         ioDescriptors[i].mKind = (i < model.inputCount()
   1041                                   ? InputOutputDescriptor::INPUT
   1042                                   : InputOutputDescriptor::OUTPUT);
   1043     }
   1044     //     We randomly interleave inputs and outputs in creation
    //     order, because when we create memory regions in a
   1046     //     TestMemories instance, the order in which regions are
   1047     //     created within a single Memory is the order they'll be laid
   1048     //     out in that memory; and when we have inputs and outputs
   1049     //     within the same Memory, we want the possibility that
   1050     //     they'll be interleaved.
   1051     std::random_shuffle(ioDescriptors.begin(), ioDescriptors.end(),
   1052                         [this](unsigned n) { return randUInt(n); });
   1053     TestMemories ioMemories;
   1054     for (auto &desc : ioDescriptors) {
   1055         if (randFrac() < 0.5) {
   1056             desc.mVector.resize(problemSize * problemSize);
   1057         } else {
   1058             // TODO: common this with the way we create IK_VALUE inputs?
   1059             unsigned memoryIndex = ~0U;
   1060             if ((ioMemories.memoryCount() != 0) && (randFrac() < 0.5)) {
   1061                 memoryIndex = randUInt(ioMemories.memoryCount());
   1062             } else {
   1063                 memoryIndex = ioMemories.addMemory();
   1064             }
   1065             const size_t length = problemSize * problemSize * sizeof(float);
   1066             desc.mMemoryRegion = ioMemories.addRegion(memoryIndex, length);
   1067         }
   1068     }
   1069     ioMemories.layout();
   1070 
   1071     // Function to set up actual inputs and outputs (initializing them
   1072     // and telling the WrapperExecution about them).
   1073     auto prepareForExecution =
   1074             [&model, &ioDescriptors, &ioMemories,
   1075              &masterInputs, &masterOutput, problemSize, &problemType](WrapperExecution *e) {
   1076         uint32_t inputIndex = 0, outputIndex = 0;
   1077         for (auto &desc : ioDescriptors) {
   1078             if (desc.getLocation() == InputOutputDescriptor::VECTOR) {
   1079                 if (desc.mKind == InputOutputDescriptor::INPUT) {
   1080                     const size_t inputOffset = inputIndex * problemSize * problemSize;
   1081                     std::copy(masterInputs.begin() + inputOffset,
   1082                               masterInputs.begin() + inputOffset + problemSize * problemSize,
   1083                               desc.mVector.begin());
   1084                     e->setInput(inputIndex++, desc.mVector.data(),
   1085                                 desc.mVector.size() * sizeof(float));
   1086                 } else {
   1087                     std::fill(desc.mVector.begin(),
   1088                               desc.mVector.begin() + problemSize * problemSize,
   1089                               masterOutput);
   1090                     e->setOutput(outputIndex++, desc.mVector.data(),
   1091                                  desc.mVector.size() * sizeof(float),
   1092                                  &problemType.operandType);
   1093                 }
   1094             } else {
   1095                 const WrapperMemory* memory;
   1096                 uint32_t offset, length;
   1097                 float* region =
   1098                         static_cast<float*>(ioMemories.getRegion(desc.mMemoryRegion,
   1099                                                                  &memory, &offset, &length));
   1100                 assert(length == problemSize * problemSize * sizeof(float));
   1101                 if (desc.mKind == InputOutputDescriptor::INPUT) {
   1102                     const size_t inputOffset = inputIndex * problemSize * problemSize;
   1103                     std::copy(masterInputs.begin() + inputOffset,
   1104                               masterInputs.begin() + inputOffset + problemSize * problemSize,
   1105                               region);
   1106                     e->setInputFromMemory(inputIndex++, memory, offset, length);
   1107                 } else {
   1108                     std::fill(region,
   1109                               region + problemSize * problemSize,
   1110                               masterOutput);
   1111                     e->setOutputFromMemory(outputIndex++, memory, offset, length,
   1112                                            &problemType.operandType);
   1113                 }
   1114             }
   1115         };
   1116         assert(inputIndex == model.inputCount());
   1117         assert(outputIndex == model.outputCount());
   1118     };
   1119 
   1120     // Non-partitioned execution.
   1121     WrapperExecution e(&c);
   1122     ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e));
   1123     ASSERT_EQ(e.compute(), Result::NO_ERROR);
   1124 
   1125     // Copy the outputs of the non-partitioned execution to a save area.
   1126     std::vector<float> nonPartitionedOutputs(problemSize * problemSize * model.outputCount());
   1127     {
   1128         uint32_t outputIndex = 0;
   1129         for (const auto& desc : ioDescriptors) {
   1130             if (desc.mKind != InputOutputDescriptor::OUTPUT) {
   1131                 continue;
   1132             }
   1133             const size_t outputOffset = outputIndex * problemSize * problemSize;
   1134             if (desc.getLocation() == InputOutputDescriptor::VECTOR) {
   1135                 std::copy(desc.mVector.begin(),
   1136                           desc.mVector.end(),
   1137                           nonPartitionedOutputs.begin() + outputOffset);
   1138             } else {
   1139                 float* region = static_cast<float*>(ioMemories.getRegion(desc.mMemoryRegion));
   1140                 std::copy(region,
   1141                           region + problemSize * problemSize,
   1142                           nonPartitionedOutputs.begin() + outputOffset);
   1143             }
   1144 #ifdef VERBOSE
   1145             {
   1146                 std::cout << "output[" << outputIndex << "] = {";
   1147                 for (auto I = nonPartitionedOutputs.begin() + outputOffset,
   1148                              E = nonPartitionedOutputs.begin() +
   1149                                      outputOffset + problemSize * problemSize;
   1150                      I != E; I++) {
   1151                     std::cout << " " << *I;
   1152                 }
   1153                 std::cout << " }" << std::endl;
   1154             }
   1155 #endif
   1156             outputIndex++;
   1157         }
   1158     }
   1159 
   1160     // Partitioned execution.
   1161     WrapperExecution e2(c2);
   1162     ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e2));
   1163     ASSERT_EQ(e2.compute(), Result::NO_ERROR);
   1164 
   1165     // Compare the outputs of the partitioned execution to the save
    // area containing the outputs of the non-partitioned execution.
   1167     {
   1168         uint32_t outputIndex = 0;
   1169         for (const auto& desc : ioDescriptors) {
   1170             if (desc.mKind != InputOutputDescriptor::OUTPUT) {
   1171                 continue;
   1172             }
   1173             SCOPED_TRACE(outputIndex);
   1174             const size_t outputOffset = outputIndex * problemSize * problemSize;
   1175             if (desc.getLocation() == InputOutputDescriptor::VECTOR) {
   1176                 ASSERT_TRUE(std::equal(desc.mVector.begin(),
   1177                                        desc.mVector.end(),
   1178                                        nonPartitionedOutputs.begin() + outputOffset));
   1179             } else {
   1180                 float* region = static_cast<float*>(ioMemories.getRegion(desc.mMemoryRegion));
   1181                 ASSERT_TRUE(std::equal(region,
   1182                                        region + problemSize * problemSize,
   1183                                        nonPartitionedOutputs.begin() + outputOffset));
   1184             }
   1185             outputIndex++;
   1186         }
   1187     }
   1188 }
   1189 
   1190 }  // namespace
   1191 }  // namespace android
   1192