/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "CompilationBuilder.h"
#include "ExecutionPlan.h"
#include "HalInterfaces.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"
#include "NeuralNetworksOEM.h"
#include "SampleDriver.h"
#include "TestNeuralNetworksWrapper.h"
#include "Utils.h"
#include "ValidateHal.h"

#include <gtest/gtest.h>

#include <filesystem>
#include <functional>
#include <map>
#include <queue>
#include <type_traits>

// Uncomment the following line to generate some debugging output that
// may be useful when analyzing failures:
//
// #define VERBOSE VERBOSE

// These tests do whitebox testing of the graph partitioning
// algorithm. It is "whitebox" in the sense that we're not evaluating
// whether a particular partitioning is legal, or "good enough"
// according to some metric, but whether it exactly matches the
// expected behavior of the current partitioning algorithm.
//
// A key part of the current partitioning algorithm is to determine
// which device among the available devices should be the one to
// execute a particular operation from the graph. This determination
// is made "locally" -- i.e., it does not depend on the graph
// topology, only on the properties of the operation in question.
// IDevice::getSupportedOperations() indicates which operations in a
// graph can be executed on a device, and IDevice::getCapabilities()
// indicates how "good" that device is for executing particular kinds
// of operations. For each operation, the partitioning algorithm
// picks the "best" device that is capable of executing that
// operation; if no device can do so, then the algorithm picks the
// cpu.
//
// As part of this testing approach, we want to make it easy to
// specify which operations in a test graph can be executed on which
// devices. We accomplish this in the following way:
// - A unary OEM operation is available.
// - There is a collection of operations (each of which has two inputs
//   and one output):
//   - Eight kinds of operations available at driver version V1_0 or
//     later. They are represented in the graph as ADD or MUL with a
//     particular activation function -- two opcodes times four
//     activation functions means eight available operation kinds.
//     This is a low-level representation detail -- when we specify the
//     behavior of the device or build a graph, we do so in terms of
//     operation encodings 0..7.
//   - Eight kinds of operations available at driver version V1_1 or
//     later. They are represented in the graph as DIV or SUB with
//     a particular activation function, exactly analogous to ADD
//     and MUL above. We use operation encodings 8..15 for them.
//   - Four kinds of operations available at driver version V1_2 or
//     later. They are represented in the graph as MAXIMUM,
//     MINIMUM, POW, or PRELU. These operations take no activation
//     function, so we only get 4 operation kinds, for which we
//     use operation encodings 16..19.
// When we instantiate a device for testing purposes, we specify what subset of
// those operations the device is able to execute.
//
// In order to determine whether or not a partitioning matches the
// expected partitioning, we check the number of partitions, check
// which device each partition targets, and compare each partition's
// subgraph, model inputs, model outputs, submodel inputs, and
// submodel outputs against what is expected. In order to perform
// that comparison, we build a model to compare against a partition's
// submodel and run a graph comparison algorithm on it. The graph
// comparison and the inputs and outputs comparisons are syntactic
// rather than semantic comparisons -- they don't allow for
// reorderings of inputs and outputs. Because of this, we need to
// know exactly how the partitioning algorithm orders inputs and
// outputs in order to construct the models and operand lists to
// compare against. Here are some relevant behaviors of the
// partitioning algorithm:
//
// - It builds a subgraph by walking operations in forward topological
//   order, and adding each operation's input operands and output
//   operands in index order (input followed by output) when that
//   operation is added. (It does not add an input that has already
//   been added.)
// - It finds model inputs, model outputs, and submodel inputs in
//   the order the corresponding operands were added to the subgraph
//   (see ExecutionStep methods getModelInputs(), getModelOutputs(),
//   getTempsAsSubModelInputs(), getOutputsAsSubModelInputs()).
// - It finds temps as submodel outputs in numerical order of corresponding
//   operand number in the original model (see ExecutionStep method
//   getTempsAsSubModelOutputs()).
// - When it calls identifyInputsAndOutputs() on the submodel, it
//   passes inputs from getModelInputs() in order, followed by temps as
//   submodel inputs from getTempsAsSubModelInputs() in order,
//   followed by outputs as submodel inputs from
//   getOutputsAsSubModelInputs() in order; and it passes outputs from
//   getModelOutputs() in order followed by submodel outputs from
//   getTempsAsSubModelOutputs() in order.
//
// TODO: Maybe the logic for comparing a partition to an expected
// model should be changed to tolerate reorderings of inputs and
// outputs, so that when we build models and lists to compare
// against, we don't need to worry about input and output
// orderings. But is there a way to do this that still lets us
// verify that we have the correct relationships between
// an (original) model's inputs and outputs and each submodel's
// inputs and outputs, as well as the correct relationship
// between submodel inputs and outputs across partitions?
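// To make the encoding scheme concrete, here are a few worked examples
// (purely illustrative -- the k*Encoding* constants defined below are
// authoritative):
// - encoding 6 is MUL (the second V1_0 opcode, whose block starts at 4)
//   with fuse code 2;
// - encoding 13 is SUB (whose block starts at 12) with fuse code 1;
// - encoding 17 is MINIMUM, which takes no fuse code.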
namespace {

const Timing kBadTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};

using CompilationBuilder = ::android::nn::CompilationBuilder;
using Device = ::android::nn::Device;
using DeviceManager = ::android::nn::DeviceManager;
using ExecutePreference = ::android::nn::test_wrapper::ExecutePreference;
using ExecutionPlan = ::android::nn::ExecutionPlan;
using ExecutionStep = ::android::nn::ExecutionStep;
using HalVersion = ::android::nn::HalVersion;
using HidlModel = ::android::hardware::neuralnetworks::V1_2::Model;
using HidlToken =
        ::android::hardware::hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>;
using ModelBuilder = ::android::nn::ModelBuilder;
using Result = ::android::nn::test_wrapper::Result;
using SampleDriver = ::android::nn::sample_driver::SampleDriver;
using WrapperSymmPerChannelQuantParams = ::android::nn::test_wrapper::SymmPerChannelQuantParams;
using WrapperCompilation = ::android::nn::test_wrapper::Compilation;
using WrapperModel = ::android::nn::test_wrapper::Model;
using WrapperOperandType = ::android::nn::test_wrapper::OperandType;
using WrapperType = ::android::nn::test_wrapper::Type;

template <typename T> using sp = ::android::sp<T>;
template <typename T>
using MQDescriptorSync = ::android::hardware::MQDescriptorSync<T>;

Capabilities makeCapabilities(float perf) {
    PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
    return {.relaxedFloat32toFloat16PerformanceScalar = perfInfo,
            .relaxedFloat32toFloat16PerformanceTensor = perfInfo,
            .operandPerformance = ::android::nn::nonExtensionOperandPerformance(perfInfo)};
}

void update(Capabilities* capabilities, OperandType type, float perf) {
    PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
    ::android::nn::update(&capabilities->operandPerformance, type, perfInfo);
}

float lookupExecTime(const Capabilities& capabilities, OperandType type) {
    return ::android::nn::lookup(capabilities.operandPerformance, type).execTime;
}

const uint32_t kNumFuseCodes = 4;
const uint32_t kBadOperation = ~0;

// V1_0 operations
const uint32_t kFirstEncodingADD = 0;
const uint32_t kFirstEncodingMUL = kFirstEncodingADD + kNumFuseCodes;
const uint32_t kFirstEncodingV1_0 = kFirstEncodingADD;
const uint32_t kLastEncodingV1_0 = kFirstEncodingMUL + kNumFuseCodes - 1;

// V1_1 operations
const uint32_t kFirstEncodingDIV = kLastEncodingV1_0 + 1;
const uint32_t kFirstEncodingSUB = kFirstEncodingDIV + kNumFuseCodes;
const uint32_t kFirstEncodingV1_1 = kFirstEncodingDIV;
const uint32_t kLastEncodingV1_1 = kFirstEncodingSUB + kNumFuseCodes - 1;

// V1_2 operations
const uint32_t kFirstEncodingMAXIMUM = kLastEncodingV1_1 + 1;
const uint32_t kFirstEncodingMINIMUM = kFirstEncodingMAXIMUM + 1;
const uint32_t kFirstEncodingPOW = kFirstEncodingMINIMUM + 1;
const uint32_t kFirstEncodingPRELU = kFirstEncodingPOW + 1;
const uint32_t kFirstEncodingV1_2 = kFirstEncodingMAXIMUM;
const uint32_t kLastEncodingV1_2 = kFirstEncodingPRELU;

const std::map<OperationType, uint32_t> operationToFirstEncoding = {
        {OperationType::ADD, kFirstEncodingADD},
        {OperationType::MUL, kFirstEncodingMUL},
        {OperationType::DIV, kFirstEncodingDIV},
        {OperationType::SUB, kFirstEncodingSUB},
        {OperationType::MAXIMUM, kFirstEncodingMAXIMUM},
        {OperationType::MINIMUM, kFirstEncodingMINIMUM},
        {OperationType::POW, kFirstEncodingPOW},
        {OperationType::PRELU, kFirstEncodingPRELU},
};

// Sorted in reverse order (std::greater) so that we can use map::lower_bound to
// find an entry whose key is numerically less than or equal to a search value.
// mapped_type is (OperandCode, hasFuseCode).
const std::map<uint32_t, std::pair<uint32_t, bool>, std::greater<>> firstEncodingToOperation = {
        {kFirstEncodingADD, {ANEURALNETWORKS_ADD, true}},
        {kFirstEncodingMUL, {ANEURALNETWORKS_MUL, true}},
        {kFirstEncodingDIV, {ANEURALNETWORKS_DIV, true}},
        {kFirstEncodingSUB, {ANEURALNETWORKS_SUB, true}},
        {kFirstEncodingMAXIMUM, {ANEURALNETWORKS_MAXIMUM, false}},
        {kFirstEncodingMINIMUM, {ANEURALNETWORKS_MINIMUM, false}},
        {kFirstEncodingPOW, {ANEURALNETWORKS_POW, false}},
        {kFirstEncodingPRELU, {ANEURALNETWORKS_PRELU, false}},
};
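// An illustrative note on the lookup above: because firstEncodingToOperation
// is sorted with std::greater, lower_bound(6) returns the entry with the
// largest key that is <= 6, namely kFirstEncodingMUL (4); the fuse code is
// then recovered as 6 - 4 == 2. addOperation2To1() below relies on exactly
// this lookup.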
// Look up the operation with the specified index in a graph, and return the
// operation encoding; or, if for some reason this is not one of the encoded
// operations, then return kBadOperation.
uint32_t lookupOperation(std::function<const Operation&(uint32_t)> getOperation,
                         std::function<const Operand&(uint32_t)> getOperand,
                         std::function<const uint8_t*(uint32_t)> getValue,
                         uint32_t operationIndex) {
    const Operation& operation = getOperation(operationIndex);
    switch (operation.type) {
        case OperationType::ADD:
        case OperationType::MUL:
        case OperationType::DIV:
        case OperationType::SUB: {
            // input2 is the fused activation function
            const Operand& input2 = getOperand(operation.inputs[2]);
            if ((input2.type == OperandType::INT32) &&
                (input2.lifetime == OperandLifeTime::CONSTANT_COPY)) {
                int32_t value;
                CHECK_EQ(sizeof(value), input2.location.length);
                memcpy(&value, getValue(input2.location.offset), input2.location.length);
                return value + operationToFirstEncoding.at(operation.type);
            }
            break;
        }
        default: {
            auto it = operationToFirstEncoding.find(operation.type);
            if (it != operationToFirstEncoding.end()) {
                return it->second;
            }
            break;
        }
    }
    return kBadOperation;
}

uint32_t lookupOperation(const HidlModel& model, uint32_t operationIndex) {
    return lookupOperation(
            [&model](uint32_t index) -> const Operation& { return model.operations[index]; },
            [&model](uint32_t index) -> const Operand& { return model.operands[index]; },
            [&model](uint32_t offset) { return &model.operandValues[offset]; },
            operationIndex);
}

#ifdef VERBOSE
// This is a debugging utility function
void dump(const char* name, const ModelBuilder* model) {
    HidlModel hidlModel;
    model->setHidlModel(&hidlModel);
    std::cout << name << ": " << toString(hidlModel) << std::endl;
    std::cout << "inputs: " << toString(hidlModel.inputIndexes) << std::endl;
    std::cout << "outputs: " << toString(hidlModel.outputIndexes) << std::endl;
    for (size_t i = 0, e = hidlModel.operations.size(); i < e; i++) {
        std::cout << "operation[" << i << "]: " << toString(hidlModel.operations[i]) << std::endl;
    }
}
#endif

// This is an IDevice for testing purposes. It only has a few
// interesting properties, all of which are specified as constructor
// arguments: device capabilities; which subset of operation kinds
// (0..19) does the device support; does the device support the OEM
// operation. The subset is represented with a bitmask, in which
// operation kind K corresponds to the bit (1 << K).
class PartitioningDriver : public SampleDriver {
   private:
    // Dummy class -- a prepared model must not be nullptr.
    class PartitioningPreparedModel : public IPreparedModel {
       public:
        Return<ErrorStatus> execute(const Request&, const sp<V1_0::IExecutionCallback>&) override {
            return ErrorStatus::DEVICE_UNAVAILABLE;
        }
        Return<ErrorStatus> execute_1_2(const Request&, MeasureTiming,
                                        const sp<V1_2::IExecutionCallback>&) override {
            return ErrorStatus::DEVICE_UNAVAILABLE;
        }
        Return<void> executeSynchronously(const Request&, MeasureTiming,
                                          executeSynchronously_cb cb) override {
            cb(ErrorStatus::DEVICE_UNAVAILABLE, {}, kBadTiming);
            return Void();
        }
        Return<void> configureExecutionBurst(
                const sp<V1_2::IBurstCallback>& /*callback*/,
                const MQDescriptorSync<V1_2::FmqRequestDatum>& /*requestChannel*/,
                const MQDescriptorSync<V1_2::FmqResultDatum>& /*resultChannel*/,
                configureExecutionBurst_cb cb) override {
            cb(ErrorStatus::DEVICE_UNAVAILABLE, nullptr);
            return Void();
        }
    };

   public:
    enum OEM {
        OEMNo,          // rejected by getSupportedOperations and prepareModel
        OEMIndecisive,  // accepted by getSupportedOperations but not prepareModel
        OEMYes,         // accepted by getSupportedOperations and prepareModel
    };

    PartitioningDriver(const char* name, const char* version, Capabilities capabilities,
                       uint32_t operationMask, OEM oem = OEMNo)
        : SampleDriver(name),
          mVersionString(version),
          mCapabilities(capabilities),
          mOperationMask(operationMask),
          mOEM(oem) {}
    ~PartitioningDriver() override {}

    Return<void> getVersionString(getVersionString_cb cb) override {
        cb(ErrorStatus::NONE, mVersionString);
        return Void();
    }

    Return<ErrorStatus> prepareModel_1_2(const Model& model, ExecutionPreference,
                                         const hidl_vec<hidl_handle>&,
                                         const hidl_vec<hidl_handle>&, const HidlToken&,
                                         const sp<IPreparedModelCallback>& cb) override {
        ErrorStatus status = ErrorStatus::NONE;
        if (mOEM != OEMYes) {
            for (const auto& operation : model.operations) {
                if (operation.type == OperationType::OEM_OPERATION) {
                    status = ErrorStatus::INVALID_ARGUMENT;
                    break;
                }
            }
        }
        cb->notify_1_2(status, new PartitioningPreparedModel);
        return status;
    }

    Return<DeviceStatus> getStatus() override {
        return DeviceStatus::AVAILABLE;
    }

    Return<void> getCapabilities_1_2(getCapabilities_1_2_cb cb) override {
        cb(ErrorStatus::NONE, mCapabilities);
        return Void();
    }

    Return<void> getSupportedOperations_1_2(const Model& model,
                                            getSupportedOperations_cb cb) override {
        if (!android::nn::validateModel(model)) {
            cb(ErrorStatus::INVALID_ARGUMENT, std::vector<bool>());
            return Void();
        }

        const size_t count = model.operations.size();
        std::vector<bool> supported(count);
        for (size_t i = 0; i < count; i++) {
            if (model.operations[i].type == OperationType::OEM_OPERATION) {
                supported[i] = (mOEM != OEMNo);
                continue;
            }
            supported[i] = false;
            uint32_t operation = lookupOperation(model, i);
            if ((operation != kBadOperation) && (mOperationMask & (1 << operation))) {
                supported[i] = true;
            }
        }
        cb(ErrorStatus::NONE, supported);
        return Void();
    }
    Return<void> getNumberOfCacheFilesNeeded(getNumberOfCacheFilesNeeded_cb cb) override {
        cb(ErrorStatus::NONE, /*numModelCache=*/1, /*numDataCache=*/1);
        return Void();
    }

    Return<ErrorStatus> prepareModelFromCache(
            const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&, const HidlToken&,
            const sp<V1_2::IPreparedModelCallback>& callback) override {
        callback->notify_1_2(ErrorStatus::NONE, new PartitioningPreparedModel);
        return ErrorStatus::NONE;
    }

   private:
    std::string mVersionString;
    Capabilities mCapabilities;
    uint32_t mOperationMask;
    OEM mOEM;
};

// Like PartitioningDriver, but implementing 1.1
class PartitioningDriverV1_1 : public V1_1::IDevice {
   public:
    PartitioningDriverV1_1(const char* name, const char* version, Capabilities capabilities,
                           uint32_t operationMask,
                           PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
        : mDriverV1_2(new PartitioningDriver(name, version, capabilities, operationMask, oem)) {}
    Return<void> getCapabilities_1_1(getCapabilities_1_1_cb _hidl_cb) override {
        return mDriverV1_2->getCapabilities_1_1(_hidl_cb);
    }
    Return<void> getSupportedOperations_1_1(const V1_1::Model& model,
                                            getSupportedOperations_1_1_cb _hidl_cb) override {
        return mDriverV1_2->getSupportedOperations_1_1(model, _hidl_cb);
    }
    Return<ErrorStatus> prepareModel_1_1(
            const V1_1::Model& model, ExecutionPreference preference,
            const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
        return mDriverV1_2->prepareModel_1_1(model, preference, actualCallback);
    }
    Return<DeviceStatus> getStatus() override { return mDriverV1_2->getStatus(); }
    Return<void> getCapabilities(getCapabilities_cb _hidl_cb) override {
        return mDriverV1_2->getCapabilities(_hidl_cb);
    }
    Return<void> getSupportedOperations(const V1_0::Model& model,
                                        getSupportedOperations_cb _hidl_cb) override {
        return mDriverV1_2->getSupportedOperations(model, _hidl_cb);
    }
    Return<ErrorStatus> prepareModel(
            const V1_0::Model& model,
            const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
        return mDriverV1_2->prepareModel(model, actualCallback);
    }

   private:
    const sp<V1_2::IDevice> mDriverV1_2;
};

// Like PartitioningDriver, but implementing 1.0
class PartitioningDriverV1_0 : public V1_0::IDevice {
   public:
    PartitioningDriverV1_0(const char* name, const char* version, Capabilities capabilities,
                           uint32_t operationMask,
                           PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
        : mDriverV1_2(new PartitioningDriver(name, version, capabilities, operationMask, oem)) {}
    Return<void> getCapabilities(getCapabilities_cb _hidl_cb) override {
        return mDriverV1_2->getCapabilities(_hidl_cb);
    }
    Return<void> getSupportedOperations(const V1_0::Model& model,
                                        getSupportedOperations_cb _hidl_cb) override {
        return mDriverV1_2->getSupportedOperations(model, _hidl_cb);
    }
    Return<ErrorStatus> prepareModel(
            const V1_0::Model& model,
            const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
        return mDriverV1_2->prepareModel(model, actualCallback);
    }
    Return<DeviceStatus> getStatus() override { return mDriverV1_2->getStatus(); }

   private:
    const sp<V1_2::IDevice> mDriverV1_2;
};
// This class adds some simple abstractions and utilities on top of
// WrapperModel. For example, it provides methods that work in terms of
// operation kind (e.g., 0..7 for the V1_0 operations); and because we care
// about graph topology rather than details of operand types and values, it
// greatly simplifies the process of creating operands.
class PartitioningModel : private WrapperModel {
   public:
    using WrapperModel::finish;
    using WrapperModel::getHandle;
    using WrapperModel::identifyInputsAndOutputs;
    using WrapperModel::isValid;
    using WrapperModel::relaxComputationFloat32toFloat16;

    // Create a tensor operand of the specified type, and return the
    // corresponding operand index.
    uint32_t addFloatOperand() { return addOperand(WrapperType::TENSOR_FLOAT32); }
    uint32_t addQuantOperand() { return addOperand(WrapperType::TENSOR_QUANT8_ASYMM); }

    // Create an operand of the specified type, and return the corresponding
    // operand index.
    uint32_t addOperand(WrapperType wrapperType) {
        switch (static_cast<int>(wrapperType)) {
            case ANEURALNETWORKS_BOOL:
            case ANEURALNETWORKS_FLOAT16:
            case ANEURALNETWORKS_FLOAT32:
            case ANEURALNETWORKS_INT32:
            case ANEURALNETWORKS_UINT32:
            case ANEURALNETWORKS_OEM_SCALAR: {
                WrapperOperandType wrapperOperandType(wrapperType, {});
                mWrapperOperandType.push_back(wrapperOperandType);
                return WrapperModel::addOperand(&wrapperOperandType);
            }

            case ANEURALNETWORKS_TENSOR_BOOL8:
            case ANEURALNETWORKS_TENSOR_FLOAT16:
            case ANEURALNETWORKS_TENSOR_FLOAT32:
            case ANEURALNETWORKS_TENSOR_OEM_BYTE: {
                WrapperOperandType wrapperOperandType(wrapperType, {1});
                mWrapperOperandType.push_back(wrapperOperandType);
                return WrapperModel::addOperand(&wrapperOperandType);
            }

            case ANEURALNETWORKS_TENSOR_INT32:
            case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM:
            case ANEURALNETWORKS_TENSOR_QUANT8_SYMM:
            case ANEURALNETWORKS_TENSOR_QUANT16_ASYMM:
            case ANEURALNETWORKS_TENSOR_QUANT16_SYMM: {
                WrapperOperandType wrapperOperandType(wrapperType, {1}, 1.0f);
                mWrapperOperandType.push_back(wrapperOperandType);
                return WrapperModel::addOperand(&wrapperOperandType);
            }

            case ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL: {
                WrapperOperandType wrapperOperandType(wrapperType, {1}, 0.0f, 0,
                                                      WrapperSymmPerChannelQuantParams({1.0f}, 0));
                mWrapperOperandType.push_back(wrapperOperandType);
                return WrapperModel::addOperand(&wrapperOperandType);
            }

            default:
                ADD_FAILURE() << "Unexpected type " << static_cast<uint32_t>(wrapperType);
                return ~uint32_t(0);
        }
    }

    enum class Dimensioned { NO, YES };

    // Create a V1_0 operation with two inputs and one output, specifying the
    // operation kind (where 0 is the first V1_0 operation) and the input
    // operand indexes.
    // Returns the output operand index.
    uint32_t addOperation2To1V1_0(uint32_t operation, const uint32_t input0, const uint32_t input1,
                                  Dimensioned dimensionedOutput = Dimensioned::YES) {
        CHECK_LE(operation, kLastEncodingV1_0 - kFirstEncodingV1_0);
        return addOperation2To1(operation + kFirstEncodingV1_0, input0, input1, dimensionedOutput);
    }
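    // For example (purely illustrative), with the encodings above,
    //   addOperation2To1V1_0(5, a, b)
    // adds a MUL operation with fuse code 1 that consumes operands a and b,
    // and returns the index of the newly created output operand.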
    // Create a V1_1 operation with two inputs and one output, specifying the
    // operation kind (where 0 is the first V1_1 operation) and the input
    // operand indexes.
    // Returns the output operand index.
    uint32_t addOperation2To1V1_1(uint32_t operation, const uint32_t input0, const uint32_t input1,
                                  Dimensioned dimensionedOutput = Dimensioned::YES) {
        CHECK_LE(operation, kLastEncodingV1_1 - kFirstEncodingV1_1);
        return addOperation2To1(operation + kFirstEncodingV1_1, input0, input1, dimensionedOutput);
    }

    // Create a V1_2 operation with two inputs and one output, specifying the
    // operation kind (where 0 is the first V1_2 operation) and the input
    // operand indexes.
    // Returns the output operand index.
    uint32_t addOperation2To1V1_2(uint32_t operation, const uint32_t input0, const uint32_t input1,
                                  Dimensioned dimensionedOutput = Dimensioned::YES) {
        CHECK_LE(operation, kLastEncodingV1_2 - kFirstEncodingV1_2);
        return addOperation2To1(operation + kFirstEncodingV1_2, input0, input1, dimensionedOutput);
    }

    // Create an OEM operation with one input and one output,
    // specifying the input operand index. Returns the output operand
    // index.
    uint32_t addOperationOEM1To1(const uint32_t input,
                                 Dimensioned dimensionedOutput = Dimensioned::YES) {
        uint32_t output = addOperandOfSameType(input, dimensionedOutput);
        addOperation(ANEURALNETWORKS_OEM_OPERATION, {input}, {output});
        return output;
    }

    // Run the partitioning algorithm to create an ExecutionPlan.
    int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                         ExecutePreference preference, ExecutionPlan* plan) {
        return reinterpret_cast<ModelBuilder*>(getHandle())->partitionTheWork(
                devices, static_cast<uint32_t>(preference), plan);
    }

#ifdef VERBOSE
    // This is a debugging utility function.
    void dump(const char* name) const {
        const ModelBuilder* mb = reinterpret_cast<const ModelBuilder*>(getHandle());
        ::dump(name, mb);
    }
#endif

   private:
    // Create an operation with two inputs and one output, specifying
    // the operation kind and the input operand indexes.
    // Returns the output operand index.
    uint32_t addOperation2To1(uint32_t operation, const uint32_t input0, const uint32_t input1,
                              Dimensioned dimensionedOutput = Dimensioned::YES) {
        auto it = firstEncodingToOperation.lower_bound(operation);
        CHECK(it != firstEncodingToOperation.end());
        ANeuralNetworksOperationType type = it->second.first;
        if (it->second.second) {
            int32_t fuseCode = operation - it->first;
            uint32_t input2 = addIntOperand(fuseCode);
            uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
            addOperation(type, {input0, input1, input2}, {output});
            return output;
        } else {
            uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
            addOperation(type, {input0, input1}, {output});
            return output;
        }
    }

    // Create a scalar integer operand of the specified value, and
    // return the corresponding operand index.
    uint32_t addIntOperand(int32_t value) {
        uint32_t operand = addOperand(WrapperType::INT32);
        setOperandValue(operand, &value, sizeof(value));
        return operand;
    }

    // Create an operand of the same type as the specified operand,
    // and return the operand index of the new operand.
    uint32_t addOperandOfSameType(uint32_t operand, Dimensioned dimensioned = Dimensioned::YES) {
        WrapperOperandType type = mWrapperOperandType.at(operand);
        for (auto& dimension : type.dimensions) {
            dimension = (dimensioned == Dimensioned::YES);
        }
        mWrapperOperandType.push_back(type);
        return WrapperModel::addOperand(&type);
    }

    // operand index to operand type
    std::vector<WrapperOperandType> mWrapperOperandType;
};

// This class adds some utilities on top of WrapperCompilation.
class PartitioningCompilation : public WrapperCompilation {
   public:
    PartitioningCompilation(const PartitioningModel* model,
                            const std::vector<std::shared_ptr<Device>>& devices) {
        ModelBuilder* m = reinterpret_cast<ModelBuilder*>(model->getHandle());
        CompilationBuilder* c = nullptr;
        int result = m->createCompilation(&c, devices);
        EXPECT_EQ(result, 0);
        mCompilation = reinterpret_cast<ANeuralNetworksCompilation*>(c);
    }

    Result setPartitioning(uint32_t partitioning) {
        return static_cast<Result>(builder()->setPartitioning(partitioning));
    }

    using WrapperCompilation::finish;

    const ExecutionPlan& getExecutionPlan() const {
        return builder()->forTest_getExecutionPlan();
    }

   private:
    CompilationBuilder* builder() {
        return reinterpret_cast<CompilationBuilder*>(getHandle());
    }

    const CompilationBuilder* builder() const {
        return reinterpret_cast<const CompilationBuilder*>(getHandle());
    }
};

#ifdef VERBOSE
#define RETURN_TRUE()                                                 \
    {                                                                 \
        std::cerr << "returning true from " << __LINE__ << std::endl; \
        return true;                                                  \
    }
#else
#define RETURN_TRUE() \
    {                 \
        return true;  \
    }
#endif
#ifdef VERBOSE
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        std::cerr << "returning false from " << __LINE__ MESSAGE << std::endl; \
        return false;                                                          \
    }
#else
#define RETURN_FALSE(MESSAGE) \
    {                         \
        return false;         \
    }
#endif

class PartitioningTest : public ::testing::Test {
   protected:
    using RemapVectorType = ExecutionStep::RemapVectorType;
    using SubModelOutputSetType = ExecutionStep::SubModelOutputSetType;

    virtual void SetUp() {}

    // From a vector of DeviceSpecification, create a vector of
    // Devices.
    struct DeviceSpecification {
        DeviceSpecification(const std::string& name, const Capabilities& capabilities,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : mName(name),
              mVersionString(kVersionString),
              mCapabilities(capabilities),
              mOperationMask(operationMask),
              mOEM(oem) {}
        DeviceSpecification(const std::string& name, float perf, uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, perf, perf, operationMask, oem) {}
        DeviceSpecification(const std::string& name, float perf, float perfRelaxed,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, kVersionString, perf, perfRelaxed, operationMask, oem) {}
        DeviceSpecification(const std::string& name, const std::string& version, float perf,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, version, perf, perf, operationMask, oem) {}
        DeviceSpecification(const std::string& name, const std::string& version, float perf,
                            float perfRelaxed, uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : mName(name), mVersionString(version), mOperationMask(operationMask), mOEM(oem) {
            PerformanceInfo perfRelaxedInfo = {.execTime = perfRelaxed, .powerUsage = perfRelaxed};
            mCapabilities = {.relaxedFloat32toFloat16PerformanceScalar = perfRelaxedInfo,
                             .relaxedFloat32toFloat16PerformanceTensor = perfRelaxedInfo,
                             .operandPerformance = ::android::nn::nonExtensionOperandPerformance(
                                     {.execTime = perf, .powerUsage = perf})};
        }
        DeviceSpecification(const std::string& name, float perf, HalVersion halVersion,
                            uint32_t operationMaskV1_0, uint32_t operationMaskV1_1 = 0,
                            uint32_t operationMaskV1_2 = 0)
            : DeviceSpecification(name, perf, perf,
                                  makeOperationMask(halVersion, operationMaskV1_0,
                                                    operationMaskV1_1, operationMaskV1_2)) {
            mHalVersion = halVersion;
        }

        std::string mName;
        std::string mVersionString;
        Capabilities mCapabilities;
        HalVersion mHalVersion = HalVersion::LATEST;
        uint32_t mOperationMask;
        PartitioningDriver::OEM mOEM = PartitioningDriver::OEMNo;

        static constexpr char kVersionString[] = "JUST_AN_EXAMPLE";

       private:
        // This function takes three operation masks aligned at the low-order
        // bit -- one mask each for V1_0, V1_1, and V1_2 -- and produces a single
        // composite operation mask, formed by shifting each of the input
        // operation masks appropriately and ORing the results together.
        //
        // For convenience, any bits of an input mask that are too high order
        // for that mask are discarded -- this allows ~0 to be a legal input
        // mask.
        //
        // For the sake of example, assume that each low order mask is 4 bits
        // wide, and take some artistic license to write literals in binary.
        // Then:
        //
        //     assert(makeOperationMask(HalVersion::V1_2, 0b0110, 0b1001, 0b0101) ==
        //            0b 0101 1001 0110);
        //
        // This is used by a DeviceSpecification constructor to build a mask of
        // operations to be supported by the device.
        static uint32_t makeOperationMask(HalVersion halVersion, uint32_t operationMaskV1_0,
                                          uint32_t operationMaskV1_1, uint32_t operationMaskV1_2) {
            if (halVersion < HalVersion::V1_2) {
                CHECK(!operationMaskV1_2);
            }
            if (halVersion < HalVersion::V1_1) {
                CHECK(!operationMaskV1_1);
            }
            auto maskOfWidth = [](uint32_t width) -> uint32_t { return (1U << width) - 1; };
            static const uint32_t kOperationMaskV1_0 =
                    maskOfWidth(kLastEncodingV1_0 - kFirstEncodingV1_0 + 1);
            static const uint32_t kOperationMaskV1_1 =
                    maskOfWidth(kLastEncodingV1_1 - kFirstEncodingV1_1 + 1);
            static const uint32_t kOperationMaskV1_2 =
                    maskOfWidth(kLastEncodingV1_2 - kFirstEncodingV1_2 + 1);
            return ((operationMaskV1_0 & kOperationMaskV1_0) << kFirstEncodingV1_0) |
                   ((operationMaskV1_1 & kOperationMaskV1_1) << kFirstEncodingV1_1) |
                   ((operationMaskV1_2 & kOperationMaskV1_2) << kFirstEncodingV1_2);
        }
    };
    static std::vector<std::shared_ptr<Device>> makeDevices(
            std::vector<DeviceSpecification> specifications) {
        std::vector<std::shared_ptr<Device>> devices;
        for (const auto& specification : specifications) {
            V1_0::IDevice* halDriver = nullptr;
            switch (specification.mHalVersion) {
                case HalVersion::V1_2:
                    halDriver = new PartitioningDriver(
                            specification.mName.c_str(), specification.mVersionString.c_str(),
                            specification.mCapabilities, specification.mOperationMask,
                            specification.mOEM);
                    break;
                case HalVersion::V1_1:
                    halDriver = new PartitioningDriverV1_1(
                            specification.mName.c_str(), specification.mVersionString.c_str(),
                            specification.mCapabilities, specification.mOperationMask,
                            specification.mOEM);
                    break;
                case HalVersion::V1_0:
                    halDriver = new PartitioningDriverV1_0(
                            specification.mName.c_str(), specification.mVersionString.c_str(),
                            specification.mCapabilities, specification.mOperationMask,
                            specification.mOEM);
                    break;
                default:
                    ADD_FAILURE() << "Unexpected";
            }
            auto device = DeviceManager::forTest_makeDriverDevice(specification.mName, halDriver);
            devices.push_back(device);
        }
        devices.push_back(DeviceManager::getCpuDevice());
        return devices;
    }

    /*-- Graph comparison ----------------------------------------------------------------*/

    // An operand with certain values for its lifetime does not have a
    // defining operation in the graph. For the purposes of the graph
    // comparison algorithm, we encode the "defining operation" index of
    // such an operand as follows:
    // - NO_VALUE       kPseudoDefiningOperationNoValue
    // - MODEL_INPUT    kPseudoDefiningOperationModelInput0 + (position in list of inputs)
    // - CONSTANT_COPY  kPseudoDefiningOperationConstantCopy0 + (constant value)
    //                  Note: For the graphs we build in this test, we
    //                  only expect to see 4-byte constants within
    //                  a very restricted range, so we only make
    //                  room for such constants in our encoding
    //                  space.
    // We do not expect to see CONSTANT_REFERENCE, and so we do not handle
    // it.
    //
    // The encoding is intended to be relatively human readable; it is not
    // designed to represent some optimal balance of ranges for the items
    // within its scope (actual operations, inputs, constants).
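    // For example (purely illustrative): the operand that is model input #2 is
    // encoded as kPseudoDefiningOperationModelInput0 + 2 == 0x80000002, and a
    // 4-byte CONSTANT_COPY operand holding the value 1 is encoded as
    // kPseudoDefiningOperationConstantCopy0 + 1 == 0x90000001.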
    enum PseudoDefiningOperationEncodings : uint32_t {
        kPseudoDefiningOperationModelInput0 = 0x80000000U,
        kPseudoDefiningOperationConstantCopy0 = 0x90000000U,
        kPseudoDefiningOperationNoValue = 0xeeeeeeeeU,

        // lowest value for special encoding
        kPseudoDefiningOperationBase = 0x80000000U,

        // range of encoded input or constant
        kPseudoDefiningOperationRange = 0x10000000U,
    };

    // Build a map from operand to defining operation.
    // TODO: Replace map with vector?
    void buildDefinitionMap(const ModelBuilder* model, std::map<uint32_t, uint32_t>* defMap) {
        // actual definitions
        ASSERT_LT(model->operationCount(), kPseudoDefiningOperationBase);
        for (uint32_t i = 0, e = model->operationCount(); i < e; i++) {
            const Operation& operation = model->getOperation(i);
            for (uint32_t output : operation.outputs) {
                (*defMap)[output] = i;
            }
        }
        // inputs
        ASSERT_LT(model->inputCount(), kPseudoDefiningOperationRange);
        for (uint32_t i = 0, e = model->inputCount(); i < e; i++) {
            (*defMap)[model->getInputOperandIndex(i)] = kPseudoDefiningOperationModelInput0 + i;
        }
        // look for NO_VALUE and CONSTANT_COPY
        for (uint32_t i = 0, e = model->operandCount(); i < e; i++) {
            const Operand& operand = model->getOperand(i);
            switch (operand.lifetime) {
                case OperandLifeTime::NO_VALUE:
                    (*defMap)[i] = kPseudoDefiningOperationNoValue;
                    break;
                case OperandLifeTime::CONSTANT_COPY: {
                    ASSERT_EQ(operand.location.length, sizeof(uint32_t));
                    uint32_t value;
                    memcpy(&value, model->getPointerToOperandValue(operand.location.offset),
                           sizeof(uint32_t));
                    ASSERT_LT(value, kPseudoDefiningOperationNoValue);
                    (*defMap)[i] = kPseudoDefiningOperationConstantCopy0 + value;
                    break;
                }
                case OperandLifeTime::TEMPORARY_VARIABLE:
                case OperandLifeTime::MODEL_INPUT:
                case OperandLifeTime::MODEL_OUTPUT:
                    // already handled
                    break;
                default:
                    FAIL();
                    break;
            }
        }
        // sanity check
        ASSERT_EQ(model->operandCount(), defMap->size());
    }

#ifdef VERBOSE
    void dump(const char* name, const std::map<uint32_t, uint32_t>* aMap) {
        auto writeNum = [](uint32_t num) {
            if (num >= kPseudoDefiningOperationBase) {
                std::cout << "0x" << std::hex << num << std::dec;
            } else {
                std::cout << num;
            }
        };

        std::cout << name << ": { ";
        bool gotOne = false;
        for (const auto& entry : *aMap) {
            if (gotOne) {
                std::cout << ", ";
            } else {
                gotOne = true;
            }
            std::cout << "(";
            writeNum(entry.first);
            std::cout << ", ";
            writeNum(entry.second);
            std::cout << ")";
        }
        std::cout << " }" << std::endl;
    }
#endif

    bool compare(const Operand& operandA, const Operand& operandB) {
        if (operandA.type != operandB.type ||
            operandA.dimensions != operandB.dimensions ||
            operandA.numberOfConsumers != operandB.numberOfConsumers ||
            operandA.scale != operandB.scale ||
            operandA.zeroPoint != operandB.zeroPoint) {
            return false;
        }
        return true;
    }

    // Compare two graphs. We ignore operand and operation indexes (i.e.,
    // two nodes can be the same even if they are numbered differently)
    // but we also ignore semantics (e.g., even if an operation kind is
    // such that the operand is commutative, we still pay attention to the
    // order of its input operands).
    //
    // The comparison algorithm works by walking modelA from outputs
    // towards inputs, along the edge from each operand to its
    // defining operation, and then along the edges to the operation's
    // input operands. At each step along the way, we try to match up
    // operands and operations from modelA with equivalent operands
    // and operations from modelB.
    //
    // We start by assuming that modelA's outputs and modelB's outputs
    // match positionally (e.g., modelA's first output operand is
    // equivalent to modelB's first output operand). Once we've
    // discovered two equivalent operands (such as those outputs), we
    // place them in a work queue. We repeatedly pull operands off
    // the queue and compare their defining operations and those
    // operations' input operands, to discover more pairs of
    // equivalent operands. If we ever find operations that do not
    // match (e.g., because operation kind differs), or operands that
    // do not match (e.g., because operand type differs); or if we
    // ever find a conflict (we've already decided that operand A's
    // equivalent operand is B0, but it looks like we need its
    // equivalent operand to be B1); then the graphs compare unequal.
    // Otherwise, we'll eventually exhaust the work queue, and
    // conclude that the graphs compare equal.
    //
    // As a side effect of the comparison, we produce a map
    // *inputsAndOutputsBToA that maps from each of the model input and output
    // operand numbers of modelB to the corresponding operand numbers of modelA.
    // If the comparison returns false, the contents of the map are undefined.
    bool compare(const ModelBuilder* modelA, const ModelBuilder* modelB,
                 std::map<uint32_t, uint32_t>* inputsAndOutputsBToA) {
        CHECK(inputsAndOutputsBToA != nullptr);
        EXPECT_TRUE(inputsAndOutputsBToA->empty());

#ifdef VERBOSE
        ::dump("compare(A)", modelA);
        ::dump("compare(B)", modelB);
#endif

        if (modelA->operandCount() != modelB->operandCount() ||
            modelA->operationCount() != modelB->operationCount() ||
            modelA->inputCount() != modelB->inputCount() ||
            modelA->outputCount() != modelB->outputCount()) {
            RETURN_FALSE();
        }

        // Maps from operand index to index of defining operation.
        std::map<uint32_t, uint32_t> defsA, defsB;
        buildDefinitionMap(modelA, &defsA);
        buildDefinitionMap(modelB, &defsB);
        if (HasFatalFailure()) return false;

        // Maps from operand index in modelA to equivalent operand index
        // in modelB; and from operation index in modelA to equivalent
        // operation index in modelB.
        std::map<uint32_t, uint32_t> equivalentOperandsAToB;
        std::map<uint32_t, uint32_t> equivalentOperationsAToB;

        // Queue of operand indexes from modelA, each of whose defining
        // operations are to be checked for equivalence with modelB.
        std::queue<uint32_t> workQueueOperandsA;

        // Seed operand equivalence map and work queue from model outputs.
        for (uint32_t i = 0, e = modelA->outputCount(); i < e; i++) {
            uint32_t outputA = modelA->getOutputOperandIndex(i);
            uint32_t outputB = modelB->getOutputOperandIndex(i);
            if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
                RETURN_FALSE();
            }
            equivalentOperandsAToB[outputA] = outputB;
            workQueueOperandsA.push(outputA);
        }

#ifdef VERBOSE
        dump("defsA", &defsA);
        dump("defsB", &defsB);
#endif

        // Process the queue.
        uint32_t pseudoDefinitionCount = 0;
        while (!workQueueOperandsA.empty()) {
#ifdef VERBOSE
            dump("equivalentOperandsAToB", &equivalentOperandsAToB);
            dump("equivalentOperationsAToB", &equivalentOperationsAToB);
#endif
            uint32_t operandIndexA = workQueueOperandsA.front();
#ifdef VERBOSE
            std::cout << "operandIndexA: " << operandIndexA << std::endl;
#endif
            workQueueOperandsA.pop();
            uint32_t operandIndexB = equivalentOperandsAToB.at(operandIndexA);

            uint32_t operationIndexA = defsA.at(operandIndexA);
            uint32_t operationIndexB = defsB.at(operandIndexB);
            auto it = equivalentOperationsAToB.find(operationIndexA);
            if (it != equivalentOperationsAToB.end()) {
                if (it->second != operationIndexB) {
                    RETURN_FALSE();
                }
                continue;
            }

            // We haven't identified an equivalent operation for
            // operationIndexA.

            if ((operationIndexA >= kPseudoDefiningOperationBase) !=
                (operationIndexB >= kPseudoDefiningOperationBase)) {
                RETURN_FALSE();
            }
            // Either both operands have pseudo-definitions, or neither
            // does.
            if (operationIndexA >= kPseudoDefiningOperationBase) {
                // Both operands have pseudo-definitions.
                if (operationIndexA != operationIndexB) {
                    RETURN_FALSE();
                }
                equivalentOperationsAToB[operationIndexA] = operationIndexB;
                ++pseudoDefinitionCount;
                continue;
            }

            // If we get here, neither operation A nor operation B is a
            // pseudo-definition.

            const Operation& operationA = modelA->getOperation(operationIndexA);
            const Operation& operationB = modelB->getOperation(operationIndexB);
            if (operationA.type != operationB.type ||
                operationA.inputs.size() != operationB.inputs.size() ||
                operationA.outputs.size() != operationB.outputs.size()) {
                RETURN_FALSE();
            }
            equivalentOperationsAToB[operationIndexA] = operationIndexB;
            for (uint32_t i = 0, e = operationA.inputs.size(); i < e; i++) {
                uint32_t inputA = operationA.inputs[i];
                uint32_t inputB = operationB.inputs[i];
                auto it = equivalentOperandsAToB.find(inputA);
                if (it != equivalentOperandsAToB.end()) {
                    if (it->second != inputB) {
                        RETURN_FALSE();
                    }
                    continue;
                }
                // We haven't identified an equivalent operand for inputA.
                if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
                    RETURN_FALSE();
                }
                equivalentOperandsAToB[inputA] = inputB;
                workQueueOperandsA.push(inputA);
            }
        }

        // Sanity check
        if (modelA->operandCount() != defsA.size() ||
            modelA->operandCount() != defsB.size() ||
            modelA->operandCount() != equivalentOperandsAToB.size() ||
            modelA->operationCount() + pseudoDefinitionCount != equivalentOperationsAToB.size()) {
            RETURN_FALSE();
        }

        // Build *inputsAndOutputsBToA
        for (uint32_t aInputIndex : modelA->getInputOperandIndexes()) {
            (*inputsAndOutputsBToA)[equivalentOperandsAToB.at(aInputIndex)] = aInputIndex;
        }
        for (uint32_t aOutputIndex : modelA->getOutputOperandIndexes()) {
            (*inputsAndOutputsBToA)[equivalentOperandsAToB.at(aOutputIndex)] = aOutputIndex;
        }

        RETURN_TRUE();
    }

    /*-------------------------------------------------------------------------------------*/

    // As a side effect of the comparison, we produce a map
    // *inputsAndOutputsModelToStep that maps from each of the model input and
    // output operand numbers of "model" to the corresponding operand numbers of
    // the submodel from "step". If the comparison returns false, the contents
    // of the map are undefined.
    bool compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
                 std::shared_ptr<Device> device,
                 std::map<uint32_t, uint32_t>* inputsAndOutputsModelToStep) {
        return (step->getDevice() == device) &&
               compare(step->getSubModel(),
                       reinterpret_cast<const ModelBuilder*>(model->getHandle()),
                       inputsAndOutputsModelToStep);
    }

    void compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
                 std::shared_ptr<Device> device, const RemapVectorType& modelInputs,
                 const RemapVectorType& modelOutputs, const RemapVectorType& tempsAsSubModelInputs,
                 const SubModelOutputSetType& tempsAsSubModelOutputs,
                 const RemapVectorType& outputsAsSubModelInputs) {
        std::map<uint32_t, uint32_t> inputsAndOutputsModelToStep;
        ASSERT_NO_FATAL_FAILURE(
                ASSERT_TRUE(compare(step, model, device, &inputsAndOutputsModelToStep)));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep, step->getModelInputs(),
                                        modelInputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep, step->getModelOutputs(),
                                        modelOutputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
                                        step->getTempsAsSubModelInputs(), tempsAsSubModelInputs));
        ASSERT_TRUE(compareSubModelOutputSets(inputsAndOutputsModelToStep,
                                              step->getTempsAsSubModelOutputs(),
                                              tempsAsSubModelOutputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
                                        step->getOutputsAsSubModelInputs(),
                                        outputsAsSubModelInputs));
    }

   private:
    static bool compareRemapVectors(const std::map<uint32_t, uint32_t>& inputsAndOutputsModelToStep,
                                    const RemapVectorType& step, RemapVectorType model) {
        std::transform(model.begin(), model.end(), model.begin(),
                       [&inputsAndOutputsModelToStep](const RemapVectorType::value_type& val) {
                           return std::make_pair(val.first,
                                                 inputsAndOutputsModelToStep.at(val.second));
                       });
        return step == model;
    }

    static bool compareSubModelOutputSets(
            const std::map<uint32_t, uint32_t>& inputsAndOutputsModelToStep,
            const SubModelOutputSetType& step,
            const SubModelOutputSetType& model) {
        SubModelOutputSetType modelTransformed;
        std::transform(
                model.begin(), model.end(), std::inserter(modelTransformed, modelTransformed.end()),
                [&inputsAndOutputsModelToStep](const SubModelOutputSetType::value_type& val) {
                    return std::make_pair(val.first, inputsAndOutputsModelToStep.at(val.second));
                });
        return step == modelTransformed;
    }
};

TEST_F(PartitioningTest, SimpleModel) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1V1_0(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({opnd0, opnd1, opnd3}, {opnd4});
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Simple partition (two devices are each capable of everything, one is the best).
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesA = makeDevices({{"bad", 0.9, ~0U}, {"good", 0.5, ~0U}});
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(planA.forTest_simpleGetDevice()->getName(), "good");

    // Simple partition (two devices are each capable of everything, none better than CPU).
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesC = makeDevices({{"bad", 1.1, ~0U}, {"bad2", 1.0, ~0U}});
    ExecutionPlan planC;
    ASSERT_EQ(model.partitionTheWork(devicesC, ExecutePreference::PREFER_LOW_POWER, &planC),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planC.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(planC.forTest_simpleGetDevice(), DeviceManager::getCpuDevice());

    // Compound partition (two devices, each is capable of one of the
    // two operations). We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    const auto devicesB = makeDevices({{"0", 0.9, 1 << 0}, {"1", 0.5, 1 << 1}});
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(2));
    {
        // Build a model to compare against the submodel from stepsB[0].
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1V1_0(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({b0Opnd0, b0Opnd1}, {b0Opnd2});
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[0], &modelB0, devicesB[0],
                        RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
                        RemapVectorType{},                                    // modelOutputs
                        RemapVectorType{},                        // tempsAsSubModelInputs
                        SubModelOutputSetType{{opnd2, b0Opnd2}},  // tempsAsSubModelOutputs
                        RemapVectorType{}));                      // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        PartitioningModel modelB1;
        uint32_t b1Opnd2 = modelB1.addFloatOperand();
        uint32_t b1Opnd3 = modelB1.addFloatOperand();
        uint32_t b1Opnd4 = modelB1.addOperation2To1V1_0(1, b1Opnd2, b1Opnd3);
        // Note: In the partitioning algorithm, submodel inputs follow
        // model inputs. In the original model "model", opnd2 is not
        // an input; so in the submodel "modelB1", the corresponding
        // input b1Opnd2 is a submodel input, and must follow the
        // model input b1Opnd3.
        modelB1.identifyInputsAndOutputs({b1Opnd3, b1Opnd2}, {b1Opnd4});
        modelB1.finish();
        ASSERT_TRUE(modelB1.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[1], &modelB1, devicesB[1],
                        RemapVectorType{{opnd3, b1Opnd3}},  // modelInputs
                        RemapVectorType{{opnd4, b1Opnd4}},  // modelOutputs
                        RemapVectorType{{opnd2, b1Opnd2}},  // tempsAsSubModelInputs
                        SubModelOutputSetType{},            // tempsAsSubModelOutputs
                        RemapVectorType{}));                // outputsAsSubModelInputs
    }
}

TEST_F(PartitioningTest, SliceModel) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1V1_0(1, opnd0, opnd1);
    uint32_t opnd4 = model.addOperation2To1V1_1(0, opnd0, opnd1);
    uint32_t opnd5 = model.addOperation2To1V1_2(0, opnd2, opnd3);
    model.identifyInputsAndOutputs({opnd0, opnd1}, {opnd2, opnd4, opnd5});
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Simple partition (V1_0, V1_1, V1_2 devices are available; V1_2 has best perf).
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesA = makeDevices({{"V1_0", 0.8, HalVersion::V1_0, ~0U},
                                       {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
                                       {"V1_2", 0.6, HalVersion::V1_2, ~0U, ~0U, ~0U}});
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(planA.forTest_simpleGetDevice()->getName(), "V1_2");

    // Compound partition (V1_0, V1_1, V1_2 devices are available, in decreasing
    // order of performance; model is distributed across all three devices).
    const auto devicesB = makeDevices({{"V1_0", 0.6, HalVersion::V1_0, ~0U},
                                       {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
                                       {"V1_2", 0.8, HalVersion::V1_2, ~0U, ~0U, ~0U}});
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(3));
    {
        // Build a model to compare against the submodel from stepsB[0].
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1V1_1(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({b0Opnd0, b0Opnd1}, {b0Opnd2});
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[0], &modelB0, devicesB[1],
                        RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
                        RemapVectorType{{opnd4, b0Opnd2}},                    // modelOutputs
                        RemapVectorType{},        // tempsAsSubModelInputs
                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
                        RemapVectorType{}));      // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        PartitioningModel modelB1;
        uint32_t b1Opnd0 = modelB1.addFloatOperand();
        uint32_t b1Opnd1 = modelB1.addFloatOperand();
        uint32_t b1Opnd2 = modelB1.addOperation2To1V1_0(0, b1Opnd0, b1Opnd1);
        uint32_t b1Opnd3 = modelB1.addOperation2To1V1_0(1, b1Opnd0, b1Opnd1);
        modelB1.identifyInputsAndOutputs({b1Opnd0, b1Opnd1}, {b1Opnd2, b1Opnd3});
        modelB1.finish();
        ASSERT_TRUE(modelB1.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[1], &modelB1, devicesB[0],
                        RemapVectorType{{opnd0, b1Opnd0}, {opnd1, b1Opnd1}},  // modelInputs
                        RemapVectorType{{opnd2, b1Opnd2}},                    // modelOutputs
                        RemapVectorType{},                        // tempsAsSubModelInputs
                        SubModelOutputSetType{{opnd3, b1Opnd3}},  // tempsAsSubModelOutputs
                        RemapVectorType{}));                      // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from stepsB[2].
        PartitioningModel modelB2;
        uint32_t b2Opnd0 = modelB2.addFloatOperand();
        uint32_t b2Opnd1 = modelB2.addFloatOperand();
        uint32_t b2Opnd2 = modelB2.addOperation2To1V1_2(0, b2Opnd0, b2Opnd1);
        // Note: In the partitioning algorithm, temps that are
        // submodel inputs precede model outputs that are submodel
        // inputs. In the original model "model", opnd3 is a temp and
        // opnd2 is a model output; so in the submodel "modelB2", the
        // corresponding inputs b2Opnd1 and b2Opnd0 must appear in
        // that order.
        modelB2.identifyInputsAndOutputs({b2Opnd1, b2Opnd0}, {b2Opnd2});
        modelB2.finish();
        ASSERT_TRUE(modelB2.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[2], &modelB2, devicesB[2],
                        RemapVectorType{},                    // modelInputs
                        RemapVectorType{{opnd5, b2Opnd2}},    // modelOutputs
                        RemapVectorType{{opnd3, b2Opnd1}},    // tempsAsSubModelInputs
                        SubModelOutputSetType{},              // tempsAsSubModelOutputs
                        RemapVectorType{{opnd2, b2Opnd0}}));  // outputsAsSubModelInputs
    }

    // TODO: Make sure this still works when we have multiple devices
    // of same version available for slicing. An easy (?) choice would
    // be to route the two different V1_0 operations to different
    // devices.
}

TEST_F(PartitioningTest, SliceModelToEmpty) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1V1_2(0, opnd0, opnd1);
    model.identifyInputsAndOutputs({opnd0, opnd1}, {opnd2});
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Only the V1_2 device can handle any operations in the model.
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devices = makeDevices({{"V1_0", 0.6, HalVersion::V1_0, ~0U},
                                      {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
                                      {"V1_2", 0.8, HalVersion::V1_2, ~0U, ~0U, ~0U}});
    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(plan.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "V1_2");
}

TEST_F(PartitioningTest, Cpu) {
    // Here's a model where some operations execute only on the Cpu.
    // To make things interesting, we produce three partitions --
    // device, cpu, same-device.

    static const uint32_t kCpuOp = 1;
    static const uint32_t kDevOp = 2;

    const auto devices = makeDevices({{"1", 0.5, 1 << kDevOp}});

    PartitioningModel model;

    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();

    uint32_t opnd2 = model.addOperation2To1V1_0(kDevOp, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1V1_0(kDevOp, opnd0, opnd2);

    uint32_t opnd4 = model.addOperation2To1V1_0(kCpuOp, opnd0, opnd3);
    uint32_t opnd5 = model.addOperation2To1V1_0(kCpuOp, opnd2, opnd4);

    uint32_t opnd6 = model.addFloatOperand();

    uint32_t opnd7 = model.addOperation2To1V1_0(kDevOp, opnd3, opnd5);
    uint32_t opnd8 = model.addOperation2To1V1_0(kDevOp, opnd6, opnd7);

    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd6 }, { opnd4, opnd8 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(3));
    {
        const auto& step0 = steps[0];

        // Build a model to compare against the submodel from steps[0].
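        // steps[0] should cover the two leading kDevOp operations; their
        // results (opnd2 and opnd3) are consumed by later partitions, so both
        // appear below as tempsAsSubModelOutputs rather than as model outputs.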
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1V1_0(kDevOp, m0Opnd0, m0Opnd1);
        uint32_t m0Opnd3 = model0.addOperation2To1V1_0(kDevOp, m0Opnd0, m0Opnd2);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2, m0Opnd3 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(step0, &model0, devices[0],
                        RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
                        RemapVectorType{},  // modelOutputs
                        RemapVectorType{},  // tempsAsSubModelInputs
                        SubModelOutputSetType{{opnd2, m0Opnd2},
                                              {opnd3, m0Opnd3}},  // tempsAsSubModelOutputs
                        RemapVectorType{}));                      // outputsAsSubModelInputs
    }
    {
        const auto& step1 = steps[1];

        // Build a model to compare against the submodel from steps[1].
        PartitioningModel model1;
        uint32_t m1Opnd0 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addFloatOperand();
        uint32_t m1Opnd4 = model1.addOperation2To1V1_0(kCpuOp, m1Opnd0, m1Opnd3);
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd5 = model1.addOperation2To1V1_0(kCpuOp, m1Opnd2, m1Opnd4);
        model1.identifyInputsAndOutputs({ m1Opnd0, m1Opnd3, m1Opnd2 }, { m1Opnd4, m1Opnd5 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());

        ASSERT_NO_FATAL_FAILURE(compare(
                step1, &model1, DeviceManager::getCpuDevice(),
                RemapVectorType{{opnd0, m1Opnd0}},  // modelInputs
                RemapVectorType{{opnd4, m1Opnd4}},  // modelOutputs
                RemapVectorType{{opnd3, m1Opnd3}, {opnd2, m1Opnd2}},  // tempsAsSubModelInputs
                SubModelOutputSetType{{opnd5, m1Opnd5}},  // tempsAsSubModelOutputs
                RemapVectorType{}));                      // outputsAsSubModelInputs
    }
    {
        const auto& step2 = steps[2];

        // Build a model to compare against the submodel from steps[2].
        PartitioningModel model2;
        uint32_t m2Opnd3 = model2.addFloatOperand();
        uint32_t m2Opnd5 = model2.addFloatOperand();
        uint32_t m2Opnd7 = model2.addOperation2To1V1_0(kDevOp, m2Opnd3, m2Opnd5);
        uint32_t m2Opnd6 = model2.addFloatOperand();
        uint32_t m2Opnd8 = model2.addOperation2To1V1_0(kDevOp, m2Opnd6, m2Opnd7);
        model2.identifyInputsAndOutputs({ m2Opnd6, m2Opnd3, m2Opnd5 }, { m2Opnd8 });
        model2.finish();
        ASSERT_TRUE(model2.isValid());

        ASSERT_NO_FATAL_FAILURE(compare(
                step2, &model2, devices[0],
                RemapVectorType{{opnd6, m2Opnd6}},  // modelInputs
                RemapVectorType{{opnd8, m2Opnd8}},  // modelOutputs
                RemapVectorType{{opnd3, m2Opnd3}, {opnd5, m2Opnd5}},  // tempsAsSubModelInputs
                SubModelOutputSetType{},  // tempsAsSubModelOutputs
                RemapVectorType{}));      // outputsAsSubModelInputs
    }
}

TEST_F(PartitioningTest, SetPartitioning) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 =
            model.addOperation2To1V1_0(0, opnd0, opnd1, PartitioningModel::Dimensioned::NO);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1V1_0(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // We expect that we cannot successfully partition, because we
    // have an intermediate operand (opnd2) without dimensions, and
    // this is not currently handled.
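    // (When partitioning does run to completion, the plan records this
    // condition; forTest_hasSubModelOutputsOfUnknownSize(), checked below,
    // reports it.)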

    // One device that can and should execute operation 0.
    const auto devices = makeDevices({{"hw", 0.5, (1 << 0)}});

    // Test kPartitioningNo. We should not even attempt partitioning,
    // so there should be a SIMPLE plan on CPU.
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    PartitioningCompilation cPNo(&model, devices);
    ASSERT_EQ(cPNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
    ASSERT_EQ(cPNo.finish(), Result::NO_ERROR);
    ASSERT_EQ(cPNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(cPNo.getExecutionPlan().forTest_simpleGetDevice(), DeviceManager::getCpuDevice());

    // Test kPartitioningWithFallback. We should attempt
    // partitioning, reach the end of the partitioning process (so we
    // have an unsuccessful execution plan), discover the dimensionless
    // intermediate operand, then fall back to CPU with a SIMPLE plan, and
    // finally return success.
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    PartitioningCompilation cPWithFallback(&model, devices);
    ASSERT_EQ(cPWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
              Result::NO_ERROR);
    ASSERT_EQ(cPWithFallback.finish(), Result::NO_ERROR);
    ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_simpleGetDevice(),
              DeviceManager::getCpuDevice());

    // Test kPartitioningWithoutFallback. We should attempt
    // partitioning, and fail.
    PartitioningCompilation cPWithoutFallback(&model, devices);
    ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
              Result::NO_ERROR);
    ASSERT_EQ(cPWithoutFallback.finish(), Result::OP_FAILED);
    ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize());
    ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);
}

// Regression test for http://b/69166603:
//     "partitioned compilation and execution yields wrong results when model output is
//     submodel input"
TEST_F(PartitioningTest, ModelOutputAsSubmodelInput) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1V1_0(1, opnd2, opnd2);
    model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2, opnd3 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Compound partition (two devices, each capable of one of the
    // two operations). We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    const auto devices = makeDevices({{"0", 0.5, 1 << 0}, {"1", 0.5, 1 << 1}});
    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(2));
    {
        // Build a model to compare against the submodel from steps[0].
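        // steps[0] computes opnd2, which is both a model output and an input
        // to steps[1] -- exactly the situation from the bug above. The
        // steps[1] comparison below checks it via outputsAsSubModelInputs.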
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1V1_0(0, m0Opnd0, m0Opnd1);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());
        ASSERT_NO_FATAL_FAILURE(
                compare(steps[0], &model0, devices[0],
                        RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
                        RemapVectorType{{opnd2, m0Opnd2}},  // modelOutputs
                        RemapVectorType{},                  // tempsAsSubModelInputs
                        SubModelOutputSetType{},            // tempsAsSubModelOutputs
                        RemapVectorType{}));                // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from steps[1].
        PartitioningModel model1;
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addOperation2To1V1_0(1, m1Opnd2, m1Opnd2);
        model1.identifyInputsAndOutputs({ m1Opnd2 }, { m1Opnd3 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(steps[1], &model1, devices[1],
                        RemapVectorType{},                     // modelInputs
                        RemapVectorType{{opnd3, m1Opnd3}},     // modelOutputs
                        RemapVectorType{},                     // tempsAsSubModelInputs
                        SubModelOutputSetType{},               // tempsAsSubModelOutputs
                        RemapVectorType{{opnd2, m1Opnd2}}));   // outputsAsSubModelInputs
    }
}

TEST_F(PartitioningTest, OemOperations) {
    // Trivial model consisting solely of an OEM operation.
    PartitioningModel model;
    uint32_t opndIn = model.addFloatOperand();
    uint32_t opndOut = model.addOperationOEM1To1(opndIn);
    model.identifyInputsAndOutputs({ opndIn }, { opndOut });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Verify that the best driver that can run an OEM operation is
    // used, even if it is not better than the CPU.
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesBestOEM = makeDevices({{"badOEM", 1.5, ~0U, PartitioningDriver::OEMYes},
                                             {"noOEM", 0.5, ~0U, PartitioningDriver::OEMNo},
                                             {"goodOEM", 1.2, ~0U, PartitioningDriver::OEMYes}});
    PartitioningCompilation compilationBestOEM(&model, devicesBestOEM);
    ASSERT_EQ(compilationBestOEM.finish(), Result::NO_ERROR);
    const auto& planBestOEM = compilationBestOEM.getExecutionPlan();
    ASSERT_EQ(planBestOEM.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planBestOEM.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(planBestOEM.forTest_simpleGetDevice()->getName(), "goodOEM");

    // Verify that we get an error if no driver can run an OEM operation.
    const auto devicesNoOEM = makeDevices({{"noOEM", 0.5, ~0U, PartitioningDriver::OEMNo}});
    PartitioningCompilation compilationNoOEM(&model, devicesNoOEM);
    ASSERT_EQ(compilationNoOEM.finish(), Result::BAD_DATA);

    // Verify that we get an error if a driver can SUPPORT but not PREPARE an OEM operation.
    const auto devicesIndecisiveOEM =
            makeDevices({{"indecisiveOEM", 0.5, ~0U, PartitioningDriver::OEMIndecisive}});
    PartitioningCompilation compilationIndecisiveOEM(&model, devicesIndecisiveOEM);
    ASSERT_NE(compilationIndecisiveOEM.finish(), Result::NO_ERROR);

    // Verify that we get an error if there are no drivers (only CPU fallback).
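    // (The CPU fallback path does not implement OEM operations, so with an
    // empty driver list nothing can run this model -- hence BAD_DATA rather
    // than a SIMPLE plan on the CPU device.)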
    PartitioningCompilation compilationNoDrivers(&model, makeDevices({}) /* no drivers */);
    ASSERT_EQ(compilationNoDrivers.finish(), Result::BAD_DATA);
}

TEST_F(PartitioningTest, RelaxedFP) {
    const auto devices = makeDevices({// Best choice for non-relaxed model.
                                      {"f32", 0.8, 0.9 /* relaxed */, ~0U},
                                      // Best choice for relaxed model.
                                      {"f16", 0.9, 0.8 /* relaxed */, ~0U}});

    auto TrivialTest = [&devices](bool doRelax, const char* expectDevice) {
        // Trivial model consisting solely of one operation.
        SCOPED_TRACE(expectDevice);
        PartitioningModel model;
        uint32_t opnd0 = model.addFloatOperand();
        uint32_t opnd1 = model.addFloatOperand();
        uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
        model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2 });
        model.relaxComputationFloat32toFloat16(doRelax);
        model.finish();
        ASSERT_TRUE(model.isValid());
        // Verify that the model will be executed on the appropriate device.
        // No need to compare the original model to the model from the plan -- we
        // didn't actually do any partitioning.
        ExecutionPlan plan;
        ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                  ANEURALNETWORKS_NO_ERROR);
        ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
        ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), expectDevice);
    };

    ASSERT_NO_FATAL_FAILURE(TrivialTest(false, "f32"));
    ASSERT_NO_FATAL_FAILURE(TrivialTest(true, "f16"));
}

TEST_F(PartitioningTest, Perf) {
    // The various type names used here are confusing.
    //
    // OperandType (from HAL file), WrapperType (from NeuralNetworksWrapper.h),
    // and OperandCode (from NeuralNetworks.h) are different enums representing
    // the same type kind -- e.g., OperandType::FLOAT32, WrapperType::FLOAT32,
    // ANEURALNETWORKS_FLOAT32. Corresponding enumerators have the same value.
    //
    // WrapperOperandType is the NeuralNetworksWrapper.h representation of a
    // full operand type (WrapperType plus dimensions plus other attributes).

    auto TestType = [](OperandType operandType) {
        SCOPED_TRACE(toString(operandType));
        // Trivial model consisting solely of an OEM operation. We
        // pick an OEM operation because this allows us to use
        // inputs and outputs of any number and type.
        PartitioningModel model;
        uint32_t opndIn = model.addOperand(static_cast<WrapperType>(operandType));
        uint32_t opndOut = model.addOperationOEM1To1(opndIn);
        model.identifyInputsAndOutputs({opndIn}, {opndOut});
        model.finish();
        ASSERT_TRUE(model.isValid());

        const Capabilities baseCapabilities = makeCapabilities(0.5);

        {
            // better than base
            Capabilities goodCapabilities = baseCapabilities;
            update(&goodCapabilities, operandType, 0.25);

            const auto devices =
                    makeDevices({{"base", baseCapabilities, ~0U, PartitioningDriver::OEMYes},
                                 {"good", goodCapabilities, ~0U, PartitioningDriver::OEMYes}});

            // Verify that the model will be executed on "good".
            // No need to compare the original model to the model from the plan -- we
            // didn't actually do any partitioning.
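            // (Capabilities values here are execution times, so lower is
            // better: "good" advertises 0.25 for this operand type versus the
            // base 0.5.)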
            ExecutionPlan plan;
            ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                      ANEURALNETWORKS_NO_ERROR);
            ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
            ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "good");
        }

        {
            // worse than base
            Capabilities badCapabilities = baseCapabilities;
            update(&badCapabilities, operandType, 0.75);
            const auto devices =
                    makeDevices({{"base", baseCapabilities, ~0U, PartitioningDriver::OEMYes},
                                 {"bad", badCapabilities, ~0U, PartitioningDriver::OEMYes}});

            // Verify that the model will be executed on "base".
            // No need to compare the original model to the model from the plan -- we
            // didn't actually do any partitioning.
            ExecutionPlan plan;
            ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                      ANEURALNETWORKS_NO_ERROR);
            ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
            ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "base");
        }
    };

    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
        TestType(static_cast<OperandType>(type));
    }
    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
        TestType(static_cast<OperandType>(type));
    }
}

// Test token rehashing during the compilation step. The runtime transforms the
// token provided by the application so that compilations that should not share
// cache entries (e.g., different devices, device versions, preferences, or
// partitioning outcomes) end up with distinct tokens; the tests below vary
// each of these factors in turn.
class CacheTest : public PartitioningTest {
   protected:
    virtual void SetUp() override {
        PartitioningTest::SetUp();
        char cacheDirTemp[] = "/data/local/tmp/TestCompilationCachingXXXXXX";
        char* cacheDir = mkdtemp(cacheDirTemp);
        ASSERT_NE(cacheDir, nullptr);
        mCacheDir = cacheDir;
    }

    virtual void TearDown() override {
        if (!::testing::Test::HasFailure()) {
            std::filesystem::remove_all(mCacheDir);
        }
        PartitioningTest::TearDown();
    }

    void expectUniqueTokens(const std::vector<std::vector<uint8_t>>& tokens) {
        for (uint32_t i = 0; i < tokens.size(); i++) {
            SCOPED_TRACE(i);
            for (uint32_t j = i + 1; j < tokens.size(); j++) {
                SCOPED_TRACE(j);
                EXPECT_NE(tokens[i], tokens[j]);
            }
        }
    }

    // Launch a single run of the partitioner against the provided model and device list with
    // the cache token provided as tokenIn. Find the partition for the device with deviceName.
    // Record the transformed token into tokenOut.
    // If tokenIn is empty, no caching information will be provided to the partitioner.
    void getTransformedCacheTokenSingle(const PartitioningModel& model,
                                        const std::vector<std::shared_ptr<Device>>& devices,
                                        const char* deviceName, const std::vector<uint8_t>& tokenIn,
                                        ExecutePreference preference,
                                        std::vector<uint8_t>* tokenOut) {
        // Compile the model and get the execution plan.
        PartitioningCompilation compilation(&model, devices);
        if (!tokenIn.empty()) {
            compilation.setCaching(mCacheDir.c_str(), tokenIn);
        }
        compilation.setPreference(preference);
        ASSERT_EQ(compilation.finish(), Result::NO_ERROR);
        const ExecutionPlan& plan = compilation.getExecutionPlan();

        // Find the cache info for the device.
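        // A SIMPLE plan carries a single cache token; a COMPOUND plan carries
        // one token per step, so we search the steps for the named device.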
        const uint8_t* token = nullptr;
        if (plan.forTest_getKind() == ExecutionPlan::Kind::SIMPLE) {
            ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), deviceName);
            token = plan.forTest_simpleGetCacheToken();
        } else if (plan.forTest_getKind() == ExecutionPlan::Kind::COMPOUND) {
            const auto& steps = plan.forTest_compoundGetSteps();
            bool found = false;
            for (const auto& step : steps) {
                // In general, two or more partitions can be on the same device. However, this
                // will not happen with the test models here, which have only 2 operations.
                if (strcmp(step->getDevice()->getName(), deviceName) == 0) {
                    ASSERT_FALSE(found);
                    token = step->forTest_getCacheToken();
                    found = true;
                }
            }
            ASSERT_TRUE(found);
        } else {
            FAIL();
        }

        // Retrieve the transformed token from the cache info.
        if (token == nullptr) {
            tokenOut->clear();
        } else {
            tokenOut->resize(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN);
            std::copy(token, token + ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, tokenOut->begin());
        }
    }

    // A wrapper of getTransformedCacheTokenSingle, which runs getTransformedCacheTokenSingle
    // multiple times and checks if the transformation provides consistent results.
    void getTransformedCacheToken(const PartitioningModel& model,
                                  const std::vector<std::shared_ptr<Device>>& devices,
                                  const char* deviceName, const std::vector<uint8_t>& tokenIn,
                                  ExecutePreference preference, std::vector<uint8_t>* tokenOut) {
        getTransformedCacheTokenSingle(model, devices, deviceName, tokenIn, preference, tokenOut);

        // Test if the runtime maps to the same cache token every time for the same compilation
        // setup.
        for (uint32_t i = 0; i < 10; i++) {
            std::vector<uint8_t> token;
            SCOPED_TRACE(i);
            getTransformedCacheTokenSingle(model, devices, deviceName, tokenIn, preference,
                                           &token);
            EXPECT_EQ(*tokenOut, token);
        }
    }

    void CreateModelForCachingTests(PartitioningModel* model) {
        uint32_t opnd0 = model->addFloatOperand();
        uint32_t opnd1 = model->addFloatOperand();
        uint32_t opnd2 = model->addOperation2To1V1_0(0, opnd0, opnd1);
        uint32_t opnd3 = model->addFloatOperand();
        uint32_t opnd4 = model->addOperation2To1V1_0(1, opnd2, opnd3);
        model->identifyInputsAndOutputs({opnd0, opnd1, opnd3}, {opnd4});
        model->finish();
        ASSERT_TRUE(model->isValid());
    }

    std::string mCacheDir;
};

// Test the case when no token is provided by the application and the execution plan has a
// simple body.
TEST_F(CacheTest, CacheTokenNoneSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // deviceA can execute the whole model.
    const auto deviceA = makeDevices({
            {"deviceA", 0.5, ~0U},
    });

    std::vector<uint8_t> tokenIn, tokenOut;
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
    EXPECT_TRUE(tokenOut.empty());
}

// Test if the runtime maps to different cache tokens for devices with different names in
// an execution plan with a simple body.
TEST_F(CacheTest, CacheTokenDifferentDeviceNamesSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // Two devices that can both execute the whole model.
    const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});
    const auto deviceB = makeDevices({{"deviceB", 0.5, ~0U}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> deviceAToken, deviceBToken;
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceAToken);
    getTransformedCacheToken(model, deviceB, "deviceB", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceBToken);
    expectUniqueTokens({deviceAToken, deviceBToken});
}

// Test if the runtime maps to different cache tokens for devices with different version strings
// in an execution plan with a simple body.
TEST_F(CacheTest, CacheTokenDifferentDeviceVersionStringsSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // Two devices that can both execute the whole model.
    const auto deviceA_1_0 = makeDevices({{"deviceA", "1.0", 0.5, ~0U}});
    const auto deviceA_1_1 = makeDevices({{"deviceA", "1.1", 0.5, ~0U}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> deviceA_1_0_Token, deviceA_1_1_Token;
    getTransformedCacheToken(model, deviceA_1_0, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_0_Token);
    getTransformedCacheToken(model, deviceA_1_1, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_1_Token);
    expectUniqueTokens({deviceA_1_0_Token, deviceA_1_1_Token});
}

// Test if the runtime maps to different cache tokens for compilations with different preferences
// in an execution plan with a simple body.
TEST_F(CacheTest, CacheTokenDifferentPreferencesSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // One device that can execute the whole model.
    const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});

    std::vector<uint8_t> fastToken, powerToken, sustainedToken;
    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &fastToken);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_LOW_POWER, &powerToken);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_SUSTAINED_SPEED, &sustainedToken);
    expectUniqueTokens({fastToken, powerToken, sustainedToken});
}

// Test if the runtime maps to different cache tokens for compilations with different tokens
// provided by the application in an execution plan with a simple body.
TEST_F(CacheTest, CacheTokenDifferentTokensSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // One device that can execute the whole model.
    const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});

    std::vector<uint8_t> tokenOut1, tokenOut2;
    std::vector<uint8_t> tokenIn1(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> tokenIn2(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 1);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn1,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn2,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
    expectUniqueTokens({tokenOut1, tokenOut2});
}

// Test the case when no token is provided by the application and the execution plan has a
// compound body.
TEST_F(CacheTest, CacheTokenNoneCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the first operation only.
    const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});

    std::vector<uint8_t> tokenIn, tokenOut;
    getTransformedCacheToken(model, devices, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
    EXPECT_TRUE(tokenOut.empty());
    getTransformedCacheToken(model, devices, "deviceB", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
    EXPECT_TRUE(tokenOut.empty());
}

// Test if the runtime maps to different cache tokens for devices with different names in
// an execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentDeviceNamesCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the first operation only.
    const auto devices1 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceC", 0.5, 1 << 1}});
    // DeviceB executes the first operation only.
    const auto devices2 = makeDevices({{"deviceB", 0.8, ~0U}, {"deviceC", 0.5, 1 << 1}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> deviceAToken, deviceBToken;
    getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceAToken);
    getTransformedCacheToken(model, devices2, "deviceB", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceBToken);
    expectUniqueTokens({deviceAToken, deviceBToken});
}

// Test if the runtime maps to different cache tokens for devices with different version strings
// in an execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentDeviceVersionStringsCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA (version 1.0) executes the first operation only.
    const auto devices1 = makeDevices({{"deviceA", "1.0", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
    // DeviceA (version 1.1) executes the first operation only.
    const auto devices2 = makeDevices({{"deviceA", "1.1", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> deviceA_1_0_Token, deviceA_1_1_Token;
    getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_0_Token);
    getTransformedCacheToken(model, devices2, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_1_Token);
    expectUniqueTokens({deviceA_1_0_Token, deviceA_1_1_Token});
}

// Test if the runtime maps to different cache tokens for compilations with different preferences
// in an execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentPreferencesCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the first operation only.
    const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});

    std::vector<uint8_t> fastToken, powerToken, sustainedToken;
    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &fastToken);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn,
                             ExecutePreference::PREFER_LOW_POWER, &powerToken);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn,
                             ExecutePreference::PREFER_SUSTAINED_SPEED, &sustainedToken);
    expectUniqueTokens({fastToken, powerToken, sustainedToken});
}

// Test if the runtime maps to different cache tokens for compilations with different tokens
// provided by the application in an execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentTokensCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the first operation only.
    const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});

    std::vector<uint8_t> tokenOut1, tokenOut2;
    std::vector<uint8_t> tokenIn1(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> tokenIn2(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 1);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn1,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn2,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
    expectUniqueTokens({tokenOut1, tokenOut2});
}

// Test if the runtime maps to different cache tokens for compilations with different partitioning
// outcomes in an execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentPartitionsCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the whole model.
    const auto devices1 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 0U}});
    // DeviceA executes the first operation only.
    const auto devices2 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
    // DeviceA executes the second operation only.
    const auto devices3 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 0}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> tokenOut1, tokenOut2, tokenOut3;
    getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
    getTransformedCacheToken(model, devices2, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
    getTransformedCacheToken(model, devices3, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut3);
    expectUniqueTokens({tokenOut1, tokenOut2, tokenOut3});
}

// Very basic tests of some of the PerformanceInfo functionality.
// Placed in this file because partitioning is the consumer of this functionality.
class PerfTest : public ::testing::Test {};

TEST_F(PerfTest, Lookup) {
    // Derive an arbitrary (but reproducible) performance value from an OperandType.
    // We'll use this to ensure that we can save and then recover a type's performance.
    auto typePerf = [](OperandType type) { return float(static_cast<uint32_t>(type)); };

    Capabilities capabilities = makeCapabilities(-1.0f);

    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
        OperandType operandType = static_cast<OperandType>(type);
        update(&capabilities, operandType, typePerf(operandType));
    }
    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
        OperandType operandType = static_cast<OperandType>(type);
        update(&capabilities, operandType, typePerf(operandType));
    }

    // Make sure lookup retrieves the values stored by update.

    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
        OperandType operandType = static_cast<OperandType>(type);
        SCOPED_TRACE(toString(operandType));
        EXPECT_EQ(lookupExecTime(capabilities, operandType), typePerf(operandType));
    }
    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
        OperandType operandType = static_cast<OperandType>(type);
        SCOPED_TRACE(toString(operandType));
        EXPECT_EQ(lookupExecTime(capabilities, operandType), typePerf(operandType));
    }

    // Check the behavior of a missing type.

    OperandType operandType =
            static_cast<OperandType>(static_cast<uint32_t>(OperandTypeRange::BASE_MAX) + 1);
    EXPECT_EQ(lookupExecTime(capabilities, operandType), FLT_MAX);
}

}  // namespace