      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #define LOG_TAG "CpuExecutor"
     18 
     19 #include "CpuExecutor.h"
     20 
     21 #include "NeuralNetworks.h"
     22 #include "OperationResolver.h"
     23 #include "Operations.h"
     24 #include "OperationsUtils.h"
     25 #include "Tracing.h"
     26 
     27 #include "Eigen/Core"
     28 // b/109953668, disable OpenMP
     29 #ifdef NNAPI_OPENMP
     30 #include <omp.h>
     31 #endif  // NNAPI_OPENMP
     32 #include <android/hardware_buffer.h>
     33 #include <sys/mman.h>
     34 
     35 namespace android {
     36 namespace nn {
     37 
     38 namespace {
     39 
     40 class OperationExecutionContext : public IOperationExecutionContext {
     41     DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);
     42 
     43    public:
     44     OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
     45         : operation(operation), operands(operands) {}
     46 
     47     uint32_t getNumInputs() const override;
     48     OperandType getInputType(uint32_t index) const override;
     49     Shape getInputShape(uint32_t index) const override;
     50     const void* getInputBuffer(uint32_t index) const override;
     51     const Operand::ExtraParams getInputExtraParams(uint32_t index) const override;
     52 
     53     uint32_t getNumOutputs() const override;
     54     OperandType getOutputType(uint32_t index) const override;
     55     Shape getOutputShape(uint32_t index) const override;
     56     void* getOutputBuffer(uint32_t index) override;
     57 
     58     // Return false on failure and store the result code.
     59     // Use getResultCode() to retrieve it at the end of the operation execution.
     60     bool setOutputShape(uint32_t index, const Shape& shape) override;
     61     int getResultCode() const;
     62 
     63     bool isOmittedInput(uint32_t index) const override;
     64     bool isOmittedOutput(uint32_t index) const override;
     65 
      66     // Return false if any input or output is omitted, i.e. has lifetime of NO_VALUE.
     67     bool checkNoOmittedOperand() const;
      68     // Return false if any input has a dimension of size 0.
     69     bool checkNoZeroSizedInput() const;
     70 
     71    private:
     72     const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
     73     const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
     74     RunTimeOperandInfo* getOutputInfo(uint32_t index);
     75 
     76     const Operation* operation;
     77     RunTimeOperandInfo* operands;
     78 
     79     int result = ANEURALNETWORKS_NO_ERROR;
     80 };
     81 
     82 const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
     83     CHECK(index < operation->inputs.size());
     84     return &operands[operation->inputs[index]];
     85 }
     86 
     87 const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
     88     CHECK(index < operation->outputs.size());
     89     return &operands[operation->outputs[index]];
     90 }
     91 
     92 RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
     93     CHECK(index < operation->outputs.size());
     94     return &operands[operation->outputs[index]];
     95 }
     96 
     97 OperandType OperationExecutionContext::getInputType(uint32_t index) const {
     98     return getInputInfo(index)->type;
     99 }
    100 
    101 Shape OperationExecutionContext::getInputShape(uint32_t index) const {
    102     return getInputInfo(index)->shape();
    103 }
    104 
    105 const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
    106     return getInputInfo(index)->buffer;
    107 }
    108 
    109 const Operand::ExtraParams OperationExecutionContext::getInputExtraParams(uint32_t index) const {
    110     return getInputInfo(index)->extraParams;
    111 }
    112 
    113 OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
    114     return getOutputInfo(index)->type;
    115 }
    116 
    117 Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
    118     return getOutputInfo(index)->shape();
    119 }
    120 
    121 void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
    122     return getOutputInfo(index)->buffer;
    123 }
    124 
    125 uint32_t OperationExecutionContext::getNumInputs() const {
    126     return operation->inputs.size();
    127 }
    128 
    129 uint32_t OperationExecutionContext::getNumOutputs() const {
    130     return operation->outputs.size();
    131 }
    132 
    133 int OperationExecutionContext::getResultCode() const {
    134     return result;
    135 }
    136 
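         // Illustrative sketch, not part of the upstream file: roughly how an operation
         // body dispatched through OperationResolver might consume this context. The
         // helper name exampleReluFloat32 and its single-input/single-output layout are
         // assumptions made for the example.
         [[maybe_unused]] bool exampleReluFloat32(IOperationExecutionContext* context) {
             // Propagate the input shape; setOutputShape() validates user-provided model
             // outputs and allocates the buffer for temporary operands.
             if (!context->setOutputShape(0, context->getInputShape(0))) return false;
             const float* in = reinterpret_cast<const float*>(context->getInputBuffer(0));
             float* out = reinterpret_cast<float*>(context->getOutputBuffer(0));
             const uint32_t count = getNumberOfElements(context->getInputShape(0));
             for (uint32_t i = 0; i < count; ++i) {
                 out[i] = in[i] > 0.0f ? in[i] : 0.0f;
             }
             return true;
         }
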
    137 // TODO: Return error code directly once we've fully integrated OperationResolver with all ops.
    138 // Updates the RunTimeOperandInfo with the newly calculated shape.
     139 // Allocates the buffer if needed.
    140 bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
    141     // For user-provided model output operands, the parameters must match the Shape
    142     // calculated from the preparation step.
    143     if (info->lifetime == OperandLifeTime::MODEL_OUTPUT) {
    144         if (info->type != shape.type) {
    145             LOG(ERROR) << "Invalid type for model output";
    146             *result = ANEURALNETWORKS_OP_FAILED;
    147             return false;
    148         }
    149         if (info->type == OperandType::TENSOR_QUANT8_ASYMM) {
    150             if (info->scale != shape.scale) {
    151                 LOG(ERROR) << "Invalid scale for model output";
    152                 *result = ANEURALNETWORKS_OP_FAILED;
    153                 return false;
    154             }
    155             if (info->zeroPoint != shape.offset) {
    156                 LOG(ERROR) << "Invalid zeroPoint for model output";
    157                 *result = ANEURALNETWORKS_OP_FAILED;
    158                 return false;
    159             }
    160         }
    161         if (info->extraParams != shape.extraParams) {
    162             LOG(ERROR) << "Invalid extraParams for model output";
    163             *result = ANEURALNETWORKS_OP_FAILED;
    164             return false;
    165         }
    166     }
    167 
    168     std::vector<uint32_t> combined;
    169     if (!combineDimensions(shape.dimensions, info->dimensions, &combined)) {
    170         LOG(ERROR) << "Invalid dimensions for model operand";
    171         *result = ANEURALNETWORKS_OP_FAILED;
    172         return false;
    173     }
    174     info->dimensions = combined;
    175     info->type = shape.type;
    176     info->scale = shape.scale;
    177     info->zeroPoint = shape.offset;
    178     info->extraParams = shape.extraParams;
    179 
     180     // Allocate the buffer only if the combined dimensions are fully specified
    181     if (info->lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info->buffer == nullptr) {
    182         if (isExtensionOperandType(info->type)) {
    183             LOG(ERROR) << "Cannot allocate a temporary variable of an extension type";
    184             *result = ANEURALNETWORKS_OP_FAILED;
    185             return false;
    186         }
    187         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
    188         if (length > 0) {
    189             info->buffer = new uint8_t[length];
    190             if (info->buffer == nullptr) {
    191                 *result = ANEURALNETWORKS_OUT_OF_MEMORY;
    192                 return false;
    193             }
    194             info->length = length;
    195         }
    196     }
    197     if (!info->isSufficient()) {
    198         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
     199         LOG(ERROR) << "Insufficient size for model operand: required = " << length
    200                    << ", provided = " << info->length;
    201         *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
    202         return false;
    203     }
    204     *result = ANEURALNETWORKS_NO_ERROR;
    205     return true;
    206 }
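
         // Worked example (illustrative, assuming combineDimensions() treats a zero
         // dimension as "unspecified"): if the operation computes shape.dimensions
         // {4, 0, 2} and the operand was declared with dimensions {0, 3, 2}, the
         // combined dimensions are {4, 3, 2}; a conflict between specified values,
         // e.g. {4, 5, 2} vs. {4, 3, 2}, makes combineDimensions() fail.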
    207 
    208 bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
    209     return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
    210 }
    211 
    212 bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
    213     return getInputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
    214 }
    215 
    216 bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
    217     return getOutputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
    218 }
    219 
    220 bool OperationExecutionContext::checkNoOmittedOperand() const {
    221     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
    222         NN_RET_CHECK(!isOmittedInput(i)) << getOperationName(operation->type) << " input operand "
    223                                          << i << " is required but missing.";
    224     }
    225     for (uint32_t i = 0; i < operation->outputs.size(); i++) {
    226         NN_RET_CHECK(!isOmittedOutput(i)) << getOperationName(operation->type) << " output operand "
    227                                           << i << " is required but missing.";
    228     }
    229     return true;
    230 }
    231 
    232 bool OperationExecutionContext::checkNoZeroSizedInput() const {
    233     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
    234         if (isOmittedInput(i)) continue;
    235         for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
    236             NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0)
    237                     << getOperationName(operation->type)
     238                     << " does not support zero-sized tensors, but input " << i << " dimension " << j
    239                     << " is 0.";
    240         }
    241     }
    242     return true;
    243 }
    244 
    245 }  // namespace
    246 
    247 // Used to keep a pointer to a memory pool.
    248 //
    249 // In the case of an "mmap_fd" pool, owns the mmap region
    250 // returned by getBuffer() -- i.e., that region goes away
    251 // when the RunTimePoolInfo is destroyed or is assigned to.
    252 class RunTimePoolInfo::RunTimePoolInfoImpl {
    253    public:
    254     RunTimePoolInfoImpl(const hidl_memory& hidlMemory, uint8_t* buffer, const sp<IMemory>& memory,
    255                         const sp<GraphicBuffer>& graphicBuffer);
    256 
    257     // rule of five...
    258     ~RunTimePoolInfoImpl();
    259     RunTimePoolInfoImpl(const RunTimePoolInfoImpl&) = delete;
    260     RunTimePoolInfoImpl(RunTimePoolInfoImpl&&) noexcept = delete;
    261     RunTimePoolInfoImpl& operator=(const RunTimePoolInfoImpl&) = delete;
    262     RunTimePoolInfoImpl& operator=(RunTimePoolInfoImpl&&) noexcept = delete;
    263 
    264     uint8_t* getBuffer() const { return mBuffer; }
    265 
    266     bool update() const;
    267 
    268     hidl_memory getHidlMemory() const { return mHidlMemory; }
    269 
    270    private:
    271     const hidl_memory mHidlMemory;     // always used
    272     uint8_t* const mBuffer = nullptr;  // always used
    273     const sp<IMemory> mMemory;         // only used when hidlMemory.name() == "ashmem"
    274     const sp<GraphicBuffer>
    275             mGraphicBuffer;  // only used when hidlMemory.name() == "hardware_buffer_blob"
    276 };
    277 
    278 RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(const hidl_memory& hidlMemory,
    279                                                           uint8_t* buffer,
    280                                                           const sp<IMemory>& memory,
    281                                                           const sp<GraphicBuffer>& graphicBuffer)
    282     : mHidlMemory(hidlMemory), mBuffer(buffer), mMemory(memory), mGraphicBuffer(graphicBuffer) {}
    283 
    284 RunTimePoolInfo::RunTimePoolInfoImpl::~RunTimePoolInfoImpl() {
    285     if (mBuffer == nullptr) {
    286         return;
    287     }
    288 
    289     const std::string memType = mHidlMemory.name();
    290     if (memType == "ashmem") {
    291         // nothing to do
    292     } else if (memType == "mmap_fd") {
    293         const size_t size = mHidlMemory.size();
    294         if (munmap(mBuffer, size)) {
     295             LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfoImpl(): Can't munmap";
    296         }
    297     } else if (memType == "hardware_buffer_blob") {
    298         mGraphicBuffer->unlock();
    299     } else if (memType == "") {
    300         // Represents a POINTER argument; nothing to do
    301     } else {
    302         LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfoImpl(): unsupported hidl_memory type";
    303     }
    304 }
    305 
     306 // Makes sure the output data is correctly updated after execution.
    307 bool RunTimePoolInfo::RunTimePoolInfoImpl::update() const {
    308     const std::string memType = mHidlMemory.name();
    309     if (memType == "ashmem") {
    310         mMemory->commit();
    311         return true;
    312     }
    313     if (memType == "mmap_fd") {
    314         int prot = mHidlMemory.handle()->data[1];
    315         if (prot & PROT_WRITE) {
    316             const size_t size = mHidlMemory.size();
    317             return msync(mBuffer, size, MS_SYNC) == 0;
    318         }
    319     }
    320     // No-op for other types of memory.
    321     return true;
    322 }
    323 
     324 // TODO: short term, make shared memory mapping and updating a utility function.
    325 // TODO: long term, implement mmap_fd as a hidl IMemory service.
    326 std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromHidlMemory(
    327         const hidl_memory& hidlMemory) {
    328     uint8_t* buffer = nullptr;
    329     sp<IMemory> memory;
    330     sp<GraphicBuffer> graphicBuffer;
    331 
    332     const auto& memType = hidlMemory.name();
    333     if (memType == "ashmem") {
    334         memory = mapMemory(hidlMemory);
    335         if (memory == nullptr) {
    336             LOG(ERROR) << "Can't map shared memory.";
    337             return std::nullopt;
    338         }
    339         memory->update();
    340         buffer = reinterpret_cast<uint8_t*>(static_cast<void*>(memory->getPointer()));
    341         if (buffer == nullptr) {
    342             LOG(ERROR) << "Can't access shared memory.";
    343             return std::nullopt;
    344         }
    345     } else if (memType == "mmap_fd") {
    346         size_t size = hidlMemory.size();
    347         int fd = hidlMemory.handle()->data[0];
    348         int prot = hidlMemory.handle()->data[1];
    349         size_t offset = getSizeFromInts(hidlMemory.handle()->data[2], hidlMemory.handle()->data[3]);
    350         buffer = static_cast<uint8_t*>(mmap(nullptr, size, prot, MAP_SHARED, fd, offset));
    351         if (buffer == MAP_FAILED) {
     352             LOG(ERROR) << "RunTimePoolInfo::createFromHidlMemory(): Can't mmap the file descriptor.";
    353             return std::nullopt;
    354         }
    355     } else if (memType == "hardware_buffer_blob") {
    356         auto handle = hidlMemory.handle();
    357         auto format = AHARDWAREBUFFER_FORMAT_BLOB;
    358         auto usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
    359         const uint32_t width = hidlMemory.size();
    360         const uint32_t height = 1;  // height is always 1 for BLOB mode AHardwareBuffer.
    361         const uint32_t layers = 1;  // layers is always 1 for BLOB mode AHardwareBuffer.
    362         const uint32_t stride = hidlMemory.size();
    363         graphicBuffer = new GraphicBuffer(handle, GraphicBuffer::HandleWrapMethod::CLONE_HANDLE,
    364                                           width, height, format, layers, usage, stride);
    365         void* gBuffer = nullptr;
    366         int32_t outBytesPerPixel, outBytesPerStride;
    367         status_t status =
    368                 graphicBuffer->lock(usage, &gBuffer, &outBytesPerPixel, &outBytesPerStride);
    369         if (status != NO_ERROR) {
     370             LOG(ERROR) << "RunTimePoolInfo::createFromHidlMemory(): Can't lock the AHardwareBuffer.";
    371             return std::nullopt;
    372         }
    373         buffer = static_cast<uint8_t*>(gBuffer);
    374     } else {
     375         LOG(ERROR) << "RunTimePoolInfo::createFromHidlMemory(): unsupported hidl_memory type";
    376         return std::nullopt;
    377     }
    378 
    379     const auto impl =
    380             std::make_shared<const RunTimePoolInfoImpl>(hidlMemory, buffer, memory, graphicBuffer);
    381     return {RunTimePoolInfo(impl)};
    382 }
    383 
    384 RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer) {
    385     const auto impl =
    386             std::make_shared<const RunTimePoolInfoImpl>(hidl_memory{}, buffer, nullptr, nullptr);
    387     return {impl};
    388 }
    389 
    390 RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
    391     : mImpl(impl) {}
    392 
    393 uint8_t* RunTimePoolInfo::getBuffer() const {
    394     return mImpl->getBuffer();
    395 }
    396 
    397 bool RunTimePoolInfo::update() const {
    398     return mImpl->update();
    399 }
    400 
    401 hidl_memory RunTimePoolInfo::getHidlMemory() const {
    402     return mImpl->getHidlMemory();
    403 }
    404 
    405 bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos,
    406                                          const hidl_vec<hidl_memory>& pools) {
    407     CHECK(poolInfos != nullptr);
    408     poolInfos->clear();
    409     poolInfos->reserve(pools.size());
    410     for (const auto& pool : pools) {
    411         if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromHidlMemory(pool)) {
    412             poolInfos->push_back(*poolInfo);
    413         } else {
    414             LOG(ERROR) << "Could not map pools";
    415             poolInfos->clear();
    416             return false;
    417         }
    418     }
    419     return true;
    420 }
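
         // Illustrative sketch of assumed caller-side usage (e.g. in a sample driver),
         // not part of this file; the function name exampleExecute is an assumption.
         [[maybe_unused]] static int exampleExecute(const Model& model, const Request& request) {
             std::vector<RunTimePoolInfo> modelPoolInfos, requestPoolInfos;
             // Map every memory pool referenced by the model and the request.
             if (!setRunTimePoolInfosFromHidlMemories(&modelPoolInfos, model.pools) ||
                 !setRunTimePoolInfosFromHidlMemories(&requestPoolInfos, request.pools)) {
                 return ANEURALNETWORKS_UNMAPPABLE;
             }
             // run() executes the operations in order and flushes the pools on success.
             CpuExecutor executor;
             return executor.run(model, request, modelPoolInfos, requestPoolInfos);
         }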
    421 
    422 template <typename T>
    423 inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
    424     uint32_t spatialSize = fromDim[2] * fromDim[3];
    425     for (uint32_t n = 0; n < fromDim[0]; n++) {
    426         for (uint32_t hw = 0; hw < spatialSize; hw++) {
    427             for (uint32_t c = 0; c < fromDim[1]; c++) {
    428                 uint32_t fromIndex = n * fromDim[1] * spatialSize + c * spatialSize + hw;
    429                 *to++ = from[fromIndex];
    430             }
    431         }
    432     }
    433     return true;
    434 }
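
         // Worked example (illustrative): with fromDim = {N=1, C=2, H=2, W=2} the
         // spatialSize is 4, and the element at (n=0, c=1, h=1, w=0) -- NCHW index
         // c * spatialSize + hw = 1 * 4 + 2 = 6 -- is written to NHWC position
         // hw * C + c = 2 * 2 + 1 = 5.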
    435 
    436 template <typename T>
    437 inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
    438     uint32_t spatialSize = fromDim[1] * fromDim[2];
    439     for (uint32_t n = 0; n < fromDim[0]; n++) {
    440         for (uint32_t c = 0; c < fromDim[3]; c++) {
    441             for (uint32_t hw = 0; hw < spatialSize; hw++) {
    442                 uint32_t fromIndex = n * spatialSize * fromDim[3] + hw * fromDim[3] + c;
    443                 *to++ = from[fromIndex];
    444             }
    445         }
    446     }
    447     return true;
    448 }
    449 
    450 static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
    451                           std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
    452     int result;
    453     if (from.dimensions.size() != 4) {
    454         LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
    455         return false;
    456     }
    457     to.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
    458     if (data_layout) {
    459         // convert dimensions
    460         Shape inShape = from.shape();
    461         auto& fromDim = from.dimensions;
    462         inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
    463         // allocate buffer
    464         to.buffer = nullptr;
    465         if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
    466             return false;
    467         }
    468         ptr_guard.reset(to.buffer);
    469         // convert value
    470         if (from.type == OperandType::TENSOR_FLOAT32) {
    471             return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
    472                                             reinterpret_cast<const float*>(from.buffer), fromDim);
    473         } else if (from.type == OperandType::TENSOR_FLOAT16) {
    474             return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
    475                                                reinterpret_cast<const _Float16*>(from.buffer),
    476                                                fromDim);
    477         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
    478             return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
    479                                               reinterpret_cast<const uint8_t*>(from.buffer),
    480                                               fromDim);
    481         } else {
    482             LOG(ERROR) << "Unsupported data type";
    483             return false;
    484         }
    485     } else {
    486         to = from;
    487     }
    488     return true;
    489 }
    490 
    491 static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
    492                             bool data_layout, int* result) {
    493     if (from.dimensions.size() != 4) {
    494         LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
    495         return false;
    496     }
    497     if (data_layout) {
    498         // convert dimensions
    499         Shape outShape = from.shape();
    500         auto& fromDim = from.dimensions;
    501         outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
    502         // allocate buffer
    503         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
    504             return false;
    505         }
    506         // convert value
    507         if (from.type == OperandType::TENSOR_FLOAT32) {
    508             return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
    509                                               reinterpret_cast<const float*>(from.buffer), fromDim);
    510         } else if (from.type == OperandType::TENSOR_FLOAT16) {
    511             return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
    512                                                  reinterpret_cast<const _Float16*>(from.buffer),
    513                                                  fromDim);
    514         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
    515             return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
    516                                                 reinterpret_cast<const uint8_t*>(from.buffer),
    517                                                 fromDim);
    518         } else {
    519             LOG(ERROR) << "Unsupported data type";
    520             return false;
    521         }
    522     } else {
    523         Shape outShape = from.shape();
    524         to.buffer = from.buffer;
    525         to.length = from.length;
    526         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
    527             return false;
    528         }
    529     }
    530     return true;
    531 }
    532 
     533 // Ignore the .pools entry in the model and the request.  This will have been taken care of
    534 // by the caller.
    535 int CpuExecutor::run(const Model& model, const Request& request,
    536                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
    537                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
    538     NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
    539     VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(toString(request)) << ")";
    540 
    541     // b/109953668, disable OpenMP
    542 #ifdef NNAPI_OPENMP
    543     ScopedOpenmpSettings openMpSettings;
    544 #endif  // NNAPI_OPENMP
    545 
    546     mModel = &model;
    547     mRequest = &request;  // TODO check if mRequest is needed
    548     initializeRunTimeInfo(modelPoolInfos, requestPoolInfos);
     549     // The model has serialized the operations in execution order.
    550     for (const auto& operation : model.operations) {
    551         int n = executeOperation(operation);
    552         if (n != ANEURALNETWORKS_NO_ERROR) {
    553             finish(n);
    554             return n;
    555         }
    556     }
    557     for (auto& runtimeInfo : modelPoolInfos) {
    558         runtimeInfo.update();
    559     }
    560     for (auto& runtimeInfo : requestPoolInfos) {
    561         runtimeInfo.update();
    562     }
    563     finish(ANEURALNETWORKS_NO_ERROR);
    564     VLOG(CPUEXE) << "Completed run normally";
    565     return ANEURALNETWORKS_NO_ERROR;
    566 }
    567 
    568 bool CpuExecutor::initializeRunTimeInfo(const std::vector<RunTimePoolInfo>& modelPoolInfos,
    569                                         const std::vector<RunTimePoolInfo>& requestPoolInfos) {
    570     VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
    571     const size_t count = mModel->operands.size();
    572     mOperands.resize(count);
    573 
    574     // Start by setting the runtime info to what's in the model.
    575     for (size_t i = 0; i < count; i++) {
    576         const Operand& from = mModel->operands[i];
    577         RunTimeOperandInfo& to = mOperands[i];
    578         to.type = from.type;
    579         to.dimensions = from.dimensions;
    580         to.scale = from.scale;
    581         to.zeroPoint = from.zeroPoint;
    582         to.length = from.location.length;
    583         to.lifetime = from.lifetime;
    584         to.extraParams = from.extraParams;
    585         switch (from.lifetime) {
    586             case OperandLifeTime::TEMPORARY_VARIABLE:
    587                 to.buffer = nullptr;
    588                 to.numberOfUsesLeft = from.numberOfConsumers;
    589                 break;
    590             case OperandLifeTime::CONSTANT_COPY:
    591                 to.buffer = const_cast<uint8_t*>(&mModel->operandValues[from.location.offset]);
    592                 to.numberOfUsesLeft = 0;
    593                 break;
    594             case OperandLifeTime::CONSTANT_REFERENCE: {
    595                 auto poolIndex = from.location.poolIndex;
    596                 nnAssert(poolIndex < modelPoolInfos.size());
    597                 auto& r = modelPoolInfos[poolIndex];
    598                 to.buffer = r.getBuffer() + from.location.offset;
    599                 to.numberOfUsesLeft = 0;
    600                 break;
    601             }
    602             case OperandLifeTime::MODEL_INPUT:
    603             case OperandLifeTime::MODEL_OUTPUT:
    604             case OperandLifeTime::NO_VALUE:
    605                 to.buffer = nullptr;
    606                 to.numberOfUsesLeft = 0;
    607                 break;
    608             default:
    609                 nnAssert(false);
    610                 break;
    611         }
    612     }
    613 
    614     // Adjust the runtime info for the arguments passed to the model,
    615     // modifying the buffer location, and possibly the dimensions.
    616     auto updateForArguments = [this, &requestPoolInfos](
    617                                       const std::vector<uint32_t>& indexes,
    618                                       const hidl_vec<RequestArgument>& arguments) {
    619         nnAssert(indexes.size() == arguments.size());
    620         for (size_t i = 0; i < indexes.size(); i++) {
    621             const uint32_t operandIndex = indexes[i];
    622             const RequestArgument& from = arguments[i];
    623             RunTimeOperandInfo& to = mOperands[operandIndex];
    624             if (from.dimensions.size() > 0) {
    625                 // It's the responsibility of the caller to validate that
    626                 // from.dimensions only modifies the dimensions that were
    627                 // unspecified in the model.  That's the case in SampleDriver.cpp
    628                 // with the call to validateRequest().
    629                 // TODO make sure that's the case for the default CPU path.
    630                 to.dimensions = from.dimensions;
    631             }
    632             if (from.hasNoValue) {
    633                 to.lifetime = OperandLifeTime::NO_VALUE;
    634                 nnAssert(to.buffer == nullptr);
    635                 to.length = 0;
    636             } else {
    637                 auto poolIndex = from.location.poolIndex;
    638                 nnAssert(poolIndex < requestPoolInfos.size());
    639                 auto& r = requestPoolInfos[poolIndex];
    640                 to.buffer = r.getBuffer() + from.location.offset;
    641                 to.length = from.location.length;
    642             }
    643         }
    644     };
    645     updateForArguments(mModel->inputIndexes, mRequest->inputs);
    646     updateForArguments(mModel->outputIndexes, mRequest->outputs);
    647 
    648     return true;
    649 }
    650 
    651 void CpuExecutor::freeNoLongerUsedOperands(const std::vector<uint32_t>& inputs) {
    652     for (uint32_t i : inputs) {
    653         auto& info = mOperands[i];
    654         // Check if it's a static or model input/output.
    655         if (info.numberOfUsesLeft == 0) {
    656             continue;
    657         }
    658         info.numberOfUsesLeft--;
    659         if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
    660             delete[] info.buffer;
    661             info.buffer = nullptr;
    662         }
    663     }
    664 }
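
         // Illustrative note, derived from the code above: a TEMPORARY_VARIABLE operand
         // consumed by two operations starts with numberOfUsesLeft == 2 (set in
         // initializeRunTimeInfo()); once its second consumer has executed, the buffer
         // allocated for it by setInfoAndAllocateIfNeeded() is freed here.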
    665 
    666 int CpuExecutor::executeOperation(const Operation& operation) {
    667     // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << toString(operation) << ")";
    668     const hidl_vec<uint32_t>& ins = operation.inputs;
    669     const hidl_vec<uint32_t>& outs = operation.outputs;
    670     bool success = false;
    671     int result = ANEURALNETWORKS_NO_ERROR;
    672 
    673     // Function to verify that the number of input and output parameters
    674     // matches what is expected.  Also checks that all the parameters have
    675     // values. This function is to be used only for operations that do not
    676     // accept optional arguments.
    677     // TODO Have a version that works for optional arguments.
    678     auto allParametersPresent = [&operation, &ins, &outs, this](size_t requiredIns,
    679                                                                 size_t requiredOuts) -> bool {
    680         auto verify = [&operation, this](size_t requiredCount, const hidl_vec<uint32_t>& indexes,
    681                                          const char* type) -> bool {
    682             size_t actualCount = indexes.size();
    683             if (actualCount != requiredCount) {
    684                 LOG(ERROR) << getOperationName(operation.type) << ": Invalid number of " << type
     685                            << " operands. Got " << actualCount << ", expected " << requiredCount;
    686                 return false;
    687             }
    688             for (size_t i = 0; i < actualCount; i++) {
    689                 if (mOperands[indexes[i]].lifetime == OperandLifeTime::NO_VALUE) {
    690                     LOG(ERROR) << getOperationName(operation.type) << " " << type << " operand "
    691                                << i << " is required but missing.";
    692                     return false;
    693                 }
    694             }
    695             return true;
    696         };
    697 
    698         auto verifyNoZeroSizedInputs = [&operation, this](const hidl_vec<uint32_t>& indexes) {
    699             for (size_t i = 0; i < indexes.size(); i++) {
    700                 for (size_t j = 0; j < mOperands[indexes[i]].dimensions.size(); j++) {
    701                     if (mOperands[indexes[i]].dimensions[j] == 0) {
    702                         LOG(ERROR) << getOperationName(operation.type)
     703                                    << " does not support zero-sized tensors, but input " << i
    704                                    << " dimension " << j << " is zero.";
    705                         return false;
    706                     }
    707                 }
    708             }
    709             return true;
    710         };
    711 
    712         return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
    713                verifyNoZeroSizedInputs(ins);
    714     };
    715 
    716     switch (operation.type) {
    717         case OperationType::OEM_OPERATION: {
    718             LOG(ERROR) << "OEM operation not supported for CPU execution";
    719             success = false;
    720         } break;
    721         case OperationType::FLOOR: {
    722             if (!allParametersPresent(1, 1)) {
    723                 return ANEURALNETWORKS_BAD_DATA;
    724             }
    725             const RunTimeOperandInfo& input = mOperands[ins[0]];
    726             RunTimeOperandInfo& output = mOperands[outs[0]];
    727             Shape outShape = output.shape();
    728 
    729             if (!floorPrepare(input.shape(), &outShape) ||
    730                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
    731                 break;
    732             }
    733             if (input.type == OperandType::TENSOR_FLOAT32) {
    734                 success = floorFloat32(reinterpret_cast<const float*>(input.buffer),
    735                                        reinterpret_cast<float*>(output.buffer), outShape);
    736             } else if (input.type == OperandType::TENSOR_FLOAT16) {
    737                 success = floorFloat16(reinterpret_cast<const _Float16*>(input.buffer),
    738                                        reinterpret_cast<_Float16*>(output.buffer), outShape);
    739             }
    740         } break;
    741         case OperationType::DEPTHWISE_CONV_2D: {
    742             const size_t inCount = ins.size();
    743             if ((inCount != 14 && inCount != 12 && inCount != 11 && inCount != 9 && inCount != 8) ||
    744                 !allParametersPresent(inCount, 1)) {
    745                 return ANEURALNETWORKS_BAD_DATA;
    746             }
    747             const RunTimeOperandInfo& input = mOperands[ins[0]];
    748             const RunTimeOperandInfo& filter = mOperands[ins[1]];
    749             const RunTimeOperandInfo& bias = mOperands[ins[2]];
    750 
    751             int32_t padding_left, padding_right;
    752             int32_t padding_top, padding_bottom;
    753             int32_t padding_implicit = 0;
    754             int32_t stride_width, stride_height;
    755             int32_t dilation_width_factor = 1, dilation_height_factor = 1;
    756             int32_t depth_multiplier;
    757             int32_t activation;
    758             bool data_layout = false;
    759             bool useImplicitPadding = false;
    760 
    761             if ((inCount >= 9 && mOperands[ins[8]].type == OperandType::BOOL) || inCount == 8) {
    762                 padding_implicit = getScalarData<int32_t>(mOperands[ins[3]]);
    763                 stride_width = getScalarData<int32_t>(mOperands[ins[4]]);
    764                 stride_height = getScalarData<int32_t>(mOperands[ins[5]]);
    765                 depth_multiplier = getScalarData<int32_t>(mOperands[ins[6]]);
    766                 activation = getScalarData<int32_t>(mOperands[ins[7]]);
    767                 if (inCount >= 9) {
    768                     data_layout = getScalarData<bool>(mOperands[ins[8]]);
    769                 }
    770                 if (inCount == 11) {
    771                     dilation_width_factor = getScalarData<int32_t>(mOperands[ins[9]]);
    772                     dilation_height_factor = getScalarData<int32_t>(mOperands[ins[10]]);
    773                 }
    774                 useImplicitPadding = true;
    775             } else if (inCount >= 11 && mOperands[ins[8]].type == OperandType::INT32) {
    776                 padding_left = getScalarData<int32_t>(mOperands[ins[3]]);
    777                 padding_right = getScalarData<int32_t>(mOperands[ins[4]]);
    778                 padding_top = getScalarData<int32_t>(mOperands[ins[5]]);
    779                 padding_bottom = getScalarData<int32_t>(mOperands[ins[6]]);
    780                 stride_width = getScalarData<int32_t>(mOperands[ins[7]]);
    781                 stride_height = getScalarData<int32_t>(mOperands[ins[8]]);
    782                 depth_multiplier = getScalarData<int32_t>(mOperands[ins[9]]);
    783                 activation = getScalarData<int32_t>(mOperands[ins[10]]);
    784                 if (inCount >= 12) {
    785                     data_layout = getScalarData<bool>(mOperands[ins[11]]);
    786                 }
    787                 if (inCount == 14) {
    788                     dilation_width_factor = getScalarData<int32_t>(mOperands[ins[12]]);
    789                     dilation_height_factor = getScalarData<int32_t>(mOperands[ins[13]]);
    790                 }
    791             } else {
    792                 return ANEURALNETWORKS_BAD_DATA;
    793             }
    794 
    795             RunTimeOperandInfo& output = mOperands[outs[0]];
    796             Shape outShape = output.shape();
    797 
    798             RunTimeOperandInfo input_tmp, output_tmp;
    799             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
    800             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
    801                 success = false;
    802                 break;
    803             }
    804             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
    805             output_tmp.buffer = data_layout ? nullptr : output.buffer;
    806             output_tmp.length = data_layout ? 0 : output.length;
    807 
    808             if (useImplicitPadding) {
    809                 Shape inputShape = input_tmp.shape();
    810                 Shape filterShape = filter.shape();
    811                 int32_t input_width = getSizeOfDimension(inputShape, 2);
    812                 int32_t input_height = getSizeOfDimension(inputShape, 1);
    813                 int32_t filter_width = getSizeOfDimension(filterShape, 2);
    814                 int32_t filter_height = getSizeOfDimension(filterShape, 1);
    815                 calculateExplicitPadding(input_width, stride_width, dilation_width_factor,
    816                                          filter_width, padding_implicit, &padding_left,
    817                                          &padding_right);
    818                 calculateExplicitPadding(input_height, stride_height, dilation_height_factor,
    819                                          filter_height, padding_implicit, &padding_top,
    820                                          &padding_bottom);
    821             }
    822 
    823             if (!depthwiseConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
    824                                       padding_right, padding_top, padding_bottom, stride_width,
    825                                       stride_height, depth_multiplier, dilation_width_factor,
    826                                       dilation_height_factor, &outShape) ||
    827                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
    828                 if (!data_layout) output.dimensions = output_tmp.dimensions;
    829                 success = false;
    830                 break;
    831             }
    832             if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
    833                 success = depthwiseConvFloat32(
    834                         reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
    835                         reinterpret_cast<const float*>(filter.buffer), filter.shape(),
    836                         reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
    837                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
    838                         dilation_width_factor, dilation_height_factor, depth_multiplier, activation,
    839                         reinterpret_cast<float*>(output_tmp.buffer), outShape);
    840             } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
    841                 success = depthwiseConvFloat16(
    842                         reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
    843                         reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
    844                         reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
    845                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
    846                         dilation_width_factor, dilation_height_factor, depth_multiplier, activation,
    847                         reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
    848             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
    849                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
    850                     success = depthwiseConvQuant8PerChannel(
    851                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
    852                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
    853                             filter.extraParams.channelQuant().scales.data(),
    854                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
    855                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
    856                             stride_height, dilation_width_factor, dilation_height_factor,
    857                             depth_multiplier, activation,
    858                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
    859                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
    860                     success = depthwiseConvQuant8(
    861                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
    862                             reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
    863                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
    864                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
    865                             stride_height, dilation_width_factor, dilation_height_factor,
    866                             depth_multiplier, activation,
    867                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
    868                 }
    869             }
    870             if (data_layout) {
    871                 output_tmp_guard.reset(output_tmp.buffer);
    872             }
    873             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
    874                 success = false;
    875                 break;
    876             }
    877         } break;
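                 // Input layouts assumed by the dispatch above (optional operands in
                 // parentheses):
                 //   implicit padding, 8/9/11 inputs: input, filter, bias, padding_scheme,
                 //       stride_w, stride_h, depth_multiplier, activation, (layout),
                 //       (dilation_w, dilation_h)
                 //   explicit padding, 11/12/14 inputs: input, filter, bias, pad_left,
                 //       pad_right, pad_top, pad_bottom, stride_w, stride_h,
                 //       depth_multiplier, activation, (layout), (dilation_w, dilation_h)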
    878         case OperationType::LOCAL_RESPONSE_NORMALIZATION: {
    879             const size_t inCount = ins.size();
    880             if ((inCount != 6 && inCount != 5) || !allParametersPresent(inCount, 1)) {
    881                 return ANEURALNETWORKS_BAD_DATA;
    882             }
    883             const RunTimeOperandInfo& input = mOperands[ins[0]];
    884             int32_t radius = getScalarData<int32_t>(mOperands[ins[1]]);
    885             float bias = (input.type == OperandType::TENSOR_FLOAT16)
    886                                  ? getScalarData<_Float16>(mOperands[ins[2]])
    887                                  : getScalarData<float>(mOperands[ins[2]]);
    888             float alpha = (input.type == OperandType::TENSOR_FLOAT16)
    889                                   ? getScalarData<_Float16>(mOperands[ins[3]])
    890                                   : getScalarData<float>(mOperands[ins[3]]);
    891             float beta = (input.type == OperandType::TENSOR_FLOAT16)
    892                                  ? getScalarData<_Float16>(mOperands[ins[4]])
    893                                  : getScalarData<float>(mOperands[ins[4]]);
    894             const int32_t axis = inCount == 6 ? getScalarData<int32_t>(mOperands[ins[5]]) : -1;
    895 
    896             RunTimeOperandInfo& output = mOperands[outs[0]];
    897             Shape outShape = output.shape();
    898 
    899             if (!genericNormalizationPrepare(input.shape(), &outShape) ||
    900                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
    901                 success = false;
    902                 break;
    903             }
    904             if (input.type == OperandType::TENSOR_FLOAT32) {
    905                 success = localResponseNormFloat32(
    906                         reinterpret_cast<const float*>(input.buffer), input.shape(), radius, bias,
    907                         alpha, beta, axis, reinterpret_cast<float*>(output.buffer), outShape);
    908             } else if (input.type == OperandType::TENSOR_FLOAT16) {
    909                 success = localResponseNormFloat16(reinterpret_cast<const _Float16*>(input.buffer),
    910                                                    input.shape(), radius, bias, alpha, beta, axis,
    911                                                    reinterpret_cast<_Float16*>(output.buffer),
    912                                                    outShape);
    913             }
    914         } break;
    915         case OperationType::RESHAPE: {
    916             if (!allParametersPresent(2, 1)) {
    917                 return ANEURALNETWORKS_BAD_DATA;
    918             }
    919             const RunTimeOperandInfo& input = mOperands[ins[0]];
    920             const RunTimeOperandInfo& targetShape = mOperands[ins[1]];
    921 
    922             RunTimeOperandInfo& output = mOperands[outs[0]];
    923             Shape outShape = output.shape();
    924 
    925             success = reshapePrepare(input.shape(),
    926                                      reinterpret_cast<const int32_t*>(targetShape.buffer),
    927                                      getNumberOfElements(targetShape.shape()), &outShape) &&
    928                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
    929                       copyData(input.buffer, input.shape(), output.buffer, outShape);
    930         } break;
    931         case OperationType::DEPTH_TO_SPACE: {
    932             const size_t inCount = ins.size();
    933             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
    934                 return ANEURALNETWORKS_BAD_DATA;
    935             }
    936             const RunTimeOperandInfo& input = mOperands[ins[0]];
    937             int32_t blockSize = getScalarData<int32_t>(mOperands[ins[1]]);
    938             bool data_layout = inCount == 3 ? getScalarData<bool>(mOperands[ins[2]]) : false;
    939 
    940             RunTimeOperandInfo& output = mOperands[outs[0]];
    941             Shape outShape = output.shape();
    942 
    943             RunTimeOperandInfo input_tmp, output_tmp;
    944             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
    945             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
    946                 success = false;
    947                 break;
    948             }
    949             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
    950             output_tmp.buffer = data_layout ? nullptr : output.buffer;
    951             output_tmp.length = data_layout ? 0 : output.length;
    952             if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
    953                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
    954                 if (!data_layout) output.dimensions = output_tmp.dimensions;
    955                 break;
    956             }
    957             switch (input_tmp.type) {
    958                 case OperandType::TENSOR_FLOAT32: {
    959                     success = depthToSpaceGeneric(
    960                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
    961                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
    962                     break;
    963                 }
    964                 case OperandType::TENSOR_FLOAT16: {
    965                     success = depthToSpaceGeneric(
    966                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
    967                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
    968                     break;
    969                 }
    970                 case OperandType::TENSOR_QUANT8_ASYMM: {
    971                     success = depthToSpaceGeneric(
    972                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
    973                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
    974                     break;
    975                 }
    976                 default: {
    977                     LOG(ERROR) << "Unsupported data type";
    978                     success = false;
    979                 }
    980             }
    981             if (data_layout) {
    982                 output_tmp_guard.reset(output_tmp.buffer);
    983             }
    984             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
    985                 success = false;
    986                 break;
    987             }
    988         } break;
    989         case OperationType::SPACE_TO_DEPTH: {
    990             const size_t inCount = ins.size();
    991             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
    992                 return ANEURALNETWORKS_BAD_DATA;
    993             }
    994             const RunTimeOperandInfo& input = mOperands[ins[0]];
    995             int32_t blockSize = getScalarData<int32_t>(mOperands[ins[1]]);
    996             bool data_layout = inCount == 3 ? getScalarData<bool>(mOperands[ins[2]]) : false;
    997 
    998             RunTimeOperandInfo& output = mOperands[outs[0]];
    999             Shape outShape = output.shape();
   1000 
   1001             RunTimeOperandInfo input_tmp, output_tmp;
   1002             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
   1003             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
   1004                 success = false;
   1005                 break;
   1006             }
   1007             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
   1008             output_tmp.buffer = data_layout ? nullptr : output.buffer;
   1009             output_tmp.length = data_layout ? 0 : output.length;
   1010 
   1011             if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
   1012                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
   1013                 if (!data_layout) output.dimensions = output_tmp.dimensions;
   1014                 break;
   1015             }
   1016             switch (input_tmp.type) {
   1017                 case OperandType::TENSOR_FLOAT32: {
   1018                     success = spaceToDepthGeneric(
   1019                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
   1020                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
   1021                     break;
   1022                 }
   1023                 case OperandType::TENSOR_FLOAT16: {
   1024                     success = spaceToDepthGeneric(
   1025                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
   1026                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
   1027                     break;
   1028                 }
   1029                 case OperandType::TENSOR_QUANT8_ASYMM: {
   1030                     success = spaceToDepthGeneric(
   1031                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
   1032                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
   1033                     break;
   1034                 }
   1035                 default: {
   1036                     LOG(ERROR) << "Unsupported data type";
   1037                     success = false;
   1038                 }
   1039             }
   1040             if (data_layout) {
   1041                 output_tmp_guard.reset(output_tmp.buffer);
   1042             }
   1043             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
   1044                 success = false;
   1045                 break;
   1046             }
   1047         } break;
   1048         case OperationType::EMBEDDING_LOOKUP: {
   1049             const RunTimeOperandInfo& values = mOperands[ins[EmbeddingLookup::kValueTensor]];
   1050             const RunTimeOperandInfo& lookups = mOperands[ins[EmbeddingLookup::kLookupTensor]];
   1051             RunTimeOperandInfo& output = mOperands[outs[EmbeddingLookup::kOutputTensor]];
   1052 
   1053             Shape outputShape;
   1054             EmbeddingLookup lookup(operation, mOperands);
   1055 
   1056             success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
   1057                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
   1058         } break;
   1059         case OperationType::HASHTABLE_LOOKUP: {
   1060             const RunTimeOperandInfo& lookups = mOperands[ins[HashtableLookup::kLookupTensor]];
   1061             const RunTimeOperandInfo& keys = mOperands[ins[HashtableLookup::kKeyTensor]];
   1062             const RunTimeOperandInfo& values = mOperands[ins[HashtableLookup::kValueTensor]];
   1063 
   1064             RunTimeOperandInfo& output = mOperands[outs[HashtableLookup::kOutputTensor]];
   1065             RunTimeOperandInfo& hits = mOperands[outs[HashtableLookup::kHitsTensor]];
   1066 
   1067             Shape outputShape, hitShape;
   1068             HashtableLookup lookup(operation, mOperands);
   1069 
   1070             success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
   1071                                              &outputShape, &hitShape) &&
   1072                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
   1073                       setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
   1074         } break;
   1075         case OperationType::LSH_PROJECTION: {
   1076             RunTimeOperandInfo& output = mOperands[outs[LSHProjection::kOutputTensor]];
   1077             Shape outputShape;
   1078             if (!LSHProjection::Prepare(operation, mOperands, &outputShape) ||
   1079                 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
   1080                 break;
   1081             }
   1082 
   1083             LSHProjection lsh(operation, mOperands);
   1084             const RunTimeOperandInfo& hash = mOperands[ins[LSHProjection::kHashTensor]];
   1085             switch (hash.type) {
   1086                 case OperandType::TENSOR_FLOAT32: {
   1087                     success = lsh.Eval<float>();
   1088                     break;
   1089                 }
   1090                 case OperandType::TENSOR_FLOAT16: {
   1091                     success = lsh.Eval<_Float16>();
   1092                     break;
   1093                 }
   1094                 default: {
   1095                     success = false;
   1096                     LOG(ERROR) << "Unsupported data type";
   1097                 }
   1098             }
   1099         } break;
   1100         case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
   1101             const auto merge_outputs = getScalarData<bool>(
   1102                     mOperands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
   1103             RunTimeOperandInfo& fwOutput =
   1104                     mOperands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
   1105             Shape fwOutputShape, bwOutputShape;
   1106 
   1107             BidirectionalSequenceLSTM lstm(operation, mOperands);
   1108             success = lstm.Prepare(operation, mOperands, &fwOutputShape, &bwOutputShape) &&
   1109                       setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
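                    // With merged outputs, both directions are written into the forward output
                    // tensor, so no separate backward output is allocated.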
   1110             if (!merge_outputs) {
   1111                 RunTimeOperandInfo& bwOutput =
   1112                         mOperands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
   1113                 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
   1114             }
   1115             success = success && lstm.Eval();
   1116         } break;
   1117         case OperationType::LSTM: {
   1118             RunTimeOperandInfo& scratch = mOperands[outs[LSTMCell::kScratchBufferTensor]];
   1119             RunTimeOperandInfo& outputStateOut = mOperands[outs[LSTMCell::kOutputStateOutTensor]];
   1120             RunTimeOperandInfo& cellStateOut = mOperands[outs[LSTMCell::kCellStateOutTensor]];
   1121             RunTimeOperandInfo& output = mOperands[outs[LSTMCell::kOutputTensor]];
   1122 
   1123             Shape scratchShape, outputStateShape, cellStateShape, outputShape;
   1124             LSTMCell lstm_cell(operation, mOperands);
   1125 
   1126             success = lstm_cell.Prepare(operation, mOperands, &scratchShape, &outputStateShape,
   1127                                         &cellStateShape, &outputShape) &&
   1128                       setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
   1129                       setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
   1130                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
   1131                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
   1132         } break;
   1133         case OperationType::RANDOM_MULTINOMIAL: {
   1137             RunTimeOperandInfo& output = mOperands[outs[Multinomial::kOutputTensor]];
   1138 
   1139             Shape outputShape;
   1140             Multinomial multinomial(operation, mOperands);
   1141 
   1142             success = Multinomial::Prepare(operation, mOperands, &outputShape) &&
   1143                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
   1144                       multinomial.Eval();
   1145         } break;
   1146         case OperationType::RNN: {
   1147             RunTimeOperandInfo& hiddenStateOut = mOperands[outs[RNN::kHiddenStateOutTensor]];
   1148             RunTimeOperandInfo& output = mOperands[outs[RNN::kOutputTensor]];
   1149 
   1150             Shape hiddenStateShape, outputShape;
   1151             RNN rnn_cell(operation, mOperands);
   1152 
   1153             success = RNN::Prepare(operation, mOperands, &hiddenStateShape, &outputShape) &&
   1154                       setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
   1155                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
   1156         } break;
   1157         case OperationType::SVDF: {
   1158             RunTimeOperandInfo& stateOut = mOperands[outs[SVDF::kStateOutTensor]];
   1159             RunTimeOperandInfo& output = mOperands[outs[SVDF::kOutputTensor]];
   1160 
   1161             Shape stateShape, outputShape;
   1162             SVDF svdf(operation, mOperands);
   1163 
   1164             success = SVDF::Prepare(operation, mOperands, &stateShape, &outputShape) &&
   1165                       setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
   1166                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
   1167         } break;
   1168         case OperationType::BATCH_TO_SPACE_ND: {
   1169             const size_t inCount = ins.size();
   1170             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
   1171                 return ANEURALNETWORKS_BAD_DATA;
   1172             }
   1173             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1174             const RunTimeOperandInfo& blockSize = mOperands[ins[1]];
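                    // The optional last input selects the data layout: true for NCHW,
                    // false (the default) for NHWC.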
   1175             bool data_layout = inCount == 3 ? getScalarData<bool>(mOperands[ins[2]]) : false;
   1176 
   1177             RunTimeOperandInfo& output = mOperands[outs[0]];
   1178             Shape outShape = output.shape();
   1179 
   1180             RunTimeOperandInfo input_tmp, output_tmp;
   1181             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
   1182             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
   1183                 success = false;
   1184                 break;
   1185             }
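                    // Stage the result in an NHWC temporary; if the operand is already NHWC,
                    // write straight into the caller's buffer instead of allocating a copy.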
   1186             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
   1187             output_tmp.buffer = data_layout ? nullptr : output.buffer;
   1188             output_tmp.length = data_layout ? 0 : output.length;
   1189 
   1190             if (!batchToSpacePrepare(input_tmp.shape(),
   1191                                      reinterpret_cast<const int32_t*>(blockSize.buffer),
   1192                                      blockSize.shape(), &outShape) ||
   1193                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
   1194                 if (!data_layout) output.dimensions = output_tmp.dimensions;
   1195                 break;
   1196             }
   1197             switch (input_tmp.type) {
   1198                 case OperandType::TENSOR_FLOAT32: {
   1199                     success = batchToSpaceGeneric(
   1200                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
   1201                             reinterpret_cast<const int32_t*>(blockSize.buffer),
   1202                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
   1203                     break;
   1204                 }
   1205                 case OperandType::TENSOR_FLOAT16: {
   1206                     success = batchToSpaceGeneric(
   1207                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
   1208                             reinterpret_cast<const int32_t*>(blockSize.buffer),
   1209                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
   1210                     break;
   1211                 }
   1212                 case OperandType::TENSOR_QUANT8_ASYMM: {
   1213                     success = batchToSpaceGeneric(
   1214                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
   1215                             reinterpret_cast<const int32_t*>(blockSize.buffer),
   1216                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
   1217                     break;
   1218                 }
   1219                 default: {
   1220                     LOG(ERROR) << "Unsupported data type";
   1221                     success = false;
   1222                 }
   1223             }
   1224             if (data_layout) {
   1225                 output_tmp_guard.reset(output_tmp.buffer);
   1226             }
   1227             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
   1228                 success = false;
   1229                 break;
   1230             }
   1231         } break;
   1232         case OperationType::SPACE_TO_BATCH_ND: {
   1233             const size_t inCount = ins.size();
   1234             if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
   1235                 return ANEURALNETWORKS_BAD_DATA;
   1236             }
   1237             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1238             const RunTimeOperandInfo& blockSize = mOperands[ins[1]];
   1239             const RunTimeOperandInfo& paddings = mOperands[ins[2]];
   1240             bool data_layout = inCount == 4 ? getScalarData<bool>(mOperands[ins[3]]) : false;
   1241 
   1242             RunTimeOperandInfo& output = mOperands[outs[0]];
   1243             Shape outShape = output.shape();
   1244 
   1245             RunTimeOperandInfo input_tmp, output_tmp;
   1246             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
   1247             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
   1248                 success = false;
   1249                 break;
   1250             }
   1251             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
   1252             output_tmp.buffer = data_layout ? nullptr : output.buffer;
   1253             output_tmp.length = data_layout ? 0 : output.length;
   1254 
   1255             if (!spaceToBatchPrepare(
   1256                         input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
   1257                         blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
   1258                         paddings.shape(), &outShape) ||
   1259                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
   1260                 if (!data_layout) output.dimensions = output_tmp.dimensions;
   1261                 break;
   1262             }
   1263             switch (input_tmp.type) {
   1264                 case OperandType::TENSOR_FLOAT32: {
   1265                     success = spaceToBatchGeneric(
   1266                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
   1267                             reinterpret_cast<const int32_t*>(blockSize.buffer),
   1268                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
   1269                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
   1270                     break;
   1271                 }
   1272                 case OperandType::TENSOR_FLOAT16: {
   1273                     success = spaceToBatchGeneric(
   1274                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
   1275                             reinterpret_cast<const int32_t*>(blockSize.buffer),
   1276                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
   1277                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
   1278                     break;
   1279                 }
   1280                 case OperandType::TENSOR_QUANT8_ASYMM: {
   1281                     success = spaceToBatchGeneric(
   1282                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
   1283                             reinterpret_cast<const int32_t*>(blockSize.buffer),
   1284                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
   1285                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
   1286                     break;
   1287                 }
   1288                 default: {
   1289                     LOG(ERROR) << "Unsupported data type";
   1290                     success = false;
   1291                 }
   1292             }
   1293             if (data_layout) {
   1294                 output_tmp_guard.reset(output_tmp.buffer);
   1295             }
   1296             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
   1297                 success = false;
   1298                 break;
   1299             }
   1300         } break;
   1301         case OperationType::PAD:
   1302         case OperationType::PAD_V2: {
   1303             const bool isV2 = operation.type == OperationType::PAD_V2;
   1304             if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
   1305                 return ANEURALNETWORKS_BAD_DATA;
   1306             }
   1307             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1308             const RunTimeOperandInfo& paddings = mOperands[ins[1]];
   1309 
   1310             RunTimeOperandInfo& output = mOperands[outs[0]];
   1311             Shape outShape = output.shape();
   1312 
   1313             if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
   1314                             paddings.shape(), &outShape) ||
   1315                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
   1316                 break;
   1317             }
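                    // PAD pads with zero (the zero point for quantized tensors); PAD_V2 reads
                    // an explicit pad value from its third input.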
   1318             if (input.type == OperandType::TENSOR_FLOAT32) {
   1319                 float pad_value = isV2 ? getScalarData<float>(mOperands[ins[2]]) : 0;
   1320                 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
   1321                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
   1322                                      reinterpret_cast<float*>(output.buffer), outShape);
   1323             } else if (input.type == OperandType::TENSOR_FLOAT16) {
   1324                 _Float16 pad_value = isV2 ? getScalarData<_Float16>(mOperands[ins[2]]) : 0;
   1325                 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
   1326                                      reinterpret_cast<const int32_t*>(paddings.buffer),
   1327                                      static_cast<_Float16>(pad_value),
   1328                                      reinterpret_cast<_Float16*>(output.buffer), outShape);
   1329             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
   1330                 uint8_t pad_value =
   1331                         isV2 ? getScalarData<uint8_t>(mOperands[ins[2]]) : outShape.offset;
   1332                 success = padGeneric(input.buffer, input.shape(),
   1333                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
   1334                                      output.buffer, outShape);
   1335             }
   1336         } break;
   1337         case OperationType::CAST: {
   1338             if (!allParametersPresent(1, 1)) {
   1339                 return ANEURALNETWORKS_BAD_DATA;
   1340             }
   1341             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1342 
   1343             RunTimeOperandInfo& output = mOperands[outs[0]];
   1344             Shape outShape = output.shape();
   1345 
   1346             success = cast::prepare(input.shape(), &outShape) &&
   1347                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
   1348                       cast::eval(input.buffer, input.shape(), output.buffer, outShape);
   1349         } break;
   1350         case OperationType::SQUEEZE: {
   1351             if (ins.size() != 2 || outs.size() != 1 ||
   1352                 mOperands[ins[0]].lifetime == OperandLifeTime::NO_VALUE ||
   1353                 mOperands[outs[0]].lifetime == OperandLifeTime::NO_VALUE) {
   1354                 LOG(ERROR) << "Wrong input/output count or lifetime for SQUEEZE op.";
   1355                 return ANEURALNETWORKS_BAD_DATA;
   1356             }
   1357             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1358             const RunTimeOperandInfo& squeezeDims = mOperands[ins[1]];
   1359 
   1360             RunTimeOperandInfo& output = mOperands[outs[0]];
   1361             Shape outShape = output.shape();
   1362 
   1363             success = squeezePrepare(input.shape(),
   1364                                      reinterpret_cast<const int32_t*>(squeezeDims.buffer),
   1365                                      squeezeDims.shape(), &outShape) &&
   1366                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
   1367                       copyData(input.buffer, input.shape(), output.buffer, outShape);
   1368         } break;
   1369         case OperationType::STRIDED_SLICE: {
   1370             if (!allParametersPresent(7, 1)) {
   1371                 return ANEURALNETWORKS_BAD_DATA;
   1372             }
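                    // begins, ends and strides are per-dimension vectors. A bit set in beginMask
                    // or endMask ignores the corresponding begin or end value (the slice extends
                    // to the edge of that dimension), and shrinkAxisMask removes the selected
                    // dimensions from the output.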
   1373             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1374             const RunTimeOperandInfo& begins = mOperands[ins[1]];
   1375             const RunTimeOperandInfo& ends = mOperands[ins[2]];
   1376             const RunTimeOperandInfo& strides = mOperands[ins[3]];
   1377             int32_t beginMask = getScalarData<int32_t>(mOperands[ins[4]]);
   1378             int32_t endMask = getScalarData<int32_t>(mOperands[ins[5]]);
   1379             int32_t shrinkAxisMask = getScalarData<int32_t>(mOperands[ins[6]]);
   1380 
   1381             RunTimeOperandInfo& output = mOperands[outs[0]];
   1382             Shape outShape = output.shape();
   1383 
   1384             success =
   1385                     stridedSlicePrepare(
   1386                             input.shape(), reinterpret_cast<const int32_t*>(begins.buffer),
   1387                             begins.shape(), reinterpret_cast<const int32_t*>(ends.buffer),
   1388                             ends.shape(), reinterpret_cast<const int32_t*>(strides.buffer),
   1389                             strides.shape(), beginMask, endMask, shrinkAxisMask, &outShape) &&
   1390                     setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
   1391                     stridedSliceGeneric(input.buffer, input.shape(),
   1392                                         reinterpret_cast<const int32_t*>(begins.buffer),
   1393                                         reinterpret_cast<const int32_t*>(ends.buffer),
   1394                                         reinterpret_cast<const int32_t*>(strides.buffer), beginMask,
   1395                                         endMask, shrinkAxisMask, output.buffer, outShape);
   1396         } break;
   1397         case OperationType::MEAN: {
   1398             if (!allParametersPresent(3, 1)) {
   1399                 return ANEURALNETWORKS_BAD_DATA;
   1400             }
   1401             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1402             const RunTimeOperandInfo& axis = mOperands[ins[1]];
   1403             int32_t keepDims = getScalarData<int32_t>(mOperands[ins[2]]);
   1404 
   1405             RunTimeOperandInfo& output = mOperands[outs[0]];
   1406             Shape outShape = output.shape();
   1407 
   1408             if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
   1409                              axis.shape(), keepDims > 0, &outShape) ||
   1410                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
   1411                 break;
   1412             }
   1413             if (input.type == OperandType::TENSOR_FLOAT16) {
   1414                 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
   1415                                       reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
   1416                                       keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
   1417                                       outShape);
   1418             } else if (input.type == OperandType::TENSOR_FLOAT32) {
   1419                 success = meanGeneric<float, float>(
   1420                         reinterpret_cast<float*>(input.buffer), input.shape(),
   1421                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
   1422                         reinterpret_cast<float*>(output.buffer), outShape);
   1423             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
   1424                 success = meanGeneric<uint8_t, int32_t>(
   1425                         reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
   1426                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
   1427                         reinterpret_cast<uint8_t*>(output.buffer), outShape);
   1428             }
   1429         } break;
   1430         case OperationType::ARGMAX:
   1431         case OperationType::ARGMIN: {
   1432             if (!allParametersPresent(2, 1)) {
   1433                 return ANEURALNETWORKS_BAD_DATA;
   1434             }
   1435             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1436             int32_t axis = getScalarData<int32_t>(mOperands[ins[1]]);
   1437 
   1438             RunTimeOperandInfo& output = mOperands[outs[0]];
   1439             Shape outShape = output.shape();
   1440 
   1441             const bool isArgMin = operation.type == OperationType::ARGMIN;
   1442             success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
   1443                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
   1444                       argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
   1445                                        outShape);
   1446         } break;
   1447         case OperationType::EXPAND_DIMS: {
   1448             if (!allParametersPresent(2, 1)) {
   1449                 return ANEURALNETWORKS_BAD_DATA;
   1450             }
   1451             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1452             int32_t axis = getScalarData<int32_t>(mOperands[ins[1]]);
   1453 
   1454             RunTimeOperandInfo& output = mOperands[outs[0]];
   1455             Shape outShape = output.shape();
   1456 
   1457             success = expand_dims::prepare(input.shape(), axis, &outShape) &&
   1458                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
   1459                       expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
   1460         } break;
   1461         case OperationType::SPLIT: {
   1462             if (ins.size() != 3) {
   1463                 LOG(ERROR) << "Wrong input count";
   1464                 return ANEURALNETWORKS_BAD_DATA;
   1465             }
   1466 
   1467             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1468             const int32_t axis = getScalarData<int32_t>(mOperands[ins[1]]);
   1469             const int32_t numOutputs = getScalarData<int32_t>(mOperands[ins[2]]);
   1470 
   1471             if (numOutputs != static_cast<int32_t>(outs.size())) {
   1472                 return ANEURALNETWORKS_BAD_DATA;
   1473             }
   1474 
   1475             std::vector<Shape> outputShapes(numOutputs);
   1476             for (int i = 0; i < numOutputs; ++i) {
   1477                 outputShapes[i] = mOperands[outs[i]].shape();
   1478             }
   1479 
   1480             success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
   1481             for (int i = 0; i < numOutputs; ++i) {
   1482                 success = success && setInfoAndAllocateIfNeeded(&(mOperands[outs[i]]),
   1483                                                                 outputShapes[i], &result);
   1484             }
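                    // Every output has been shaped and allocated above; dispatch on the element
                    // type to copy each slice of the input into its output.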
   1485             switch (input.type) {
   1486                 case OperandType::TENSOR_FLOAT16: {
   1487                     std::vector<_Float16*> outputDataPtrs(numOutputs);
   1488                     for (int i = 0; i < numOutputs; ++i) {
   1489                         outputDataPtrs[i] = reinterpret_cast<_Float16*>(mOperands[outs[i]].buffer);
   1490                     }
   1491                     success = success &&
   1492                               splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
   1493                                            input.shape(), axis, &outputDataPtrs, outputShapes);
   1494                 } break;
   1495                 case OperandType::TENSOR_FLOAT32: {
   1496                     std::vector<float*> outputDataPtrs(numOutputs);
   1497                     for (int i = 0; i < numOutputs; ++i) {
   1498                         outputDataPtrs[i] = reinterpret_cast<float*>(mOperands[outs[i]].buffer);
   1499                     }
   1500                     success = success &&
   1501                               splitFloat32(reinterpret_cast<const float*>(input.buffer),
   1502                                            input.shape(), axis, &outputDataPtrs, outputShapes);
   1503                 } break;
   1504                 case OperandType::TENSOR_INT32: {
   1505                     std::vector<int32_t*> outputDataPtrs(numOutputs);
   1506                     for (int i = 0; i < numOutputs; ++i) {
   1507                         outputDataPtrs[i] = reinterpret_cast<int32_t*>(mOperands[outs[i]].buffer);
   1508                     }
   1509                     success = success &&
   1510                               splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
   1511                                          input.shape(), axis, &outputDataPtrs, outputShapes);
   1512                 } break;
   1513                 case OperandType::TENSOR_QUANT8_ASYMM: {
   1514                     std::vector<uint8_t*> outputDataPtrs(numOutputs);
   1515                     for (int i = 0; i < numOutputs; ++i) {
   1516                         outputDataPtrs[i] = reinterpret_cast<uint8_t*>(mOperands[outs[i]].buffer);
   1517                     }
   1518                     success = success &&
   1519                               splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
   1520                                           input.shape(), axis, &outputDataPtrs, outputShapes);
   1521                 } break;
   1522                 default: {
   1523                     return ANEURALNETWORKS_BAD_DATA;
   1524                 }
   1525             }
   1526         } break;
   1527         case OperationType::MAXIMUM:
   1528         case OperationType::MINIMUM: {
   1529             if (!allParametersPresent(2, 1)) {
   1530                 return ANEURALNETWORKS_BAD_DATA;
   1531             }
   1532             const RunTimeOperandInfo& in1 = mOperands[ins[0]];
   1533             const RunTimeOperandInfo& in2 = mOperands[ins[1]];
   1534 
   1535             RunTimeOperandInfo& output = mOperands[outs[0]];
   1536             Shape outputShape = output.shape();
   1537 
   1538             const bool isMinimum = operation.type == OperationType::MINIMUM;
   1539             success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
   1540                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
   1541                       maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
   1542                                             isMinimum, output.buffer, outputShape);
   1543         } break;
   1544         case OperationType::GROUPED_CONV_2D: {
   1545             const size_t inCount = ins.size();
   1546             if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
   1547                 return ANEURALNETWORKS_BAD_DATA;
   1548             }
   1549             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1550             const RunTimeOperandInfo& filter = mOperands[ins[1]];
   1551             const RunTimeOperandInfo& bias = mOperands[ins[2]];
   1552 
   1553             int32_t padding_left, padding_right;
   1554             int32_t padding_top, padding_bottom;
   1555             int32_t padding_implicit = 0;
   1556             int32_t stride_width, stride_height;
   1557             int32_t numGroups;
   1558             int32_t activation;
   1559             bool data_layout = false;
   1560 
   1561             if (inCount == 12) {
   1562                 padding_left = getScalarData<int32_t>(mOperands[ins[3]]);
   1563                 padding_right = getScalarData<int32_t>(mOperands[ins[4]]);
   1564                 padding_top = getScalarData<int32_t>(mOperands[ins[5]]);
   1565                 padding_bottom = getScalarData<int32_t>(mOperands[ins[6]]);
   1566                 stride_width = getScalarData<int32_t>(mOperands[ins[7]]);
   1567                 stride_height = getScalarData<int32_t>(mOperands[ins[8]]);
   1568                 numGroups = getScalarData<int32_t>(mOperands[ins[9]]);
   1569                 activation = getScalarData<int32_t>(mOperands[ins[10]]);
   1570                 data_layout = getScalarData<bool>(mOperands[ins[11]]);
   1571             } else {
   1572                 padding_implicit = getScalarData<int32_t>(mOperands[ins[3]]);
   1573                 stride_width = getScalarData<int32_t>(mOperands[ins[4]]);
   1574                 stride_height = getScalarData<int32_t>(mOperands[ins[5]]);
   1575                 numGroups = getScalarData<int32_t>(mOperands[ins[6]]);
   1576                 activation = getScalarData<int32_t>(mOperands[ins[7]]);
   1577                 data_layout = getScalarData<bool>(mOperands[ins[8]]);
   1578             }
   1579 
   1580             RunTimeOperandInfo& output = mOperands[outs[0]];
   1581             Shape outShape = output.shape();
   1582 
   1583             RunTimeOperandInfo input_tmp, output_tmp;
   1584             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
   1585             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
   1586                 success = false;
   1587                 break;
   1588             }
   1589             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
   1590             output_tmp.buffer = data_layout ? nullptr : output.buffer;
   1591             output_tmp.length = data_layout ? 0 : output.length;
   1592 
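                    // For the implicit-padding signature (9 inputs), derive the explicit padding
                    // from the NHWC input and filter dimensions.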
   1593             if (inCount == 9) {
   1594                 Shape inputShape = input_tmp.shape();
   1595                 Shape filterShape = filter.shape();
   1596                 int32_t input_width = getSizeOfDimension(inputShape, 2);
   1597                 int32_t input_height = getSizeOfDimension(inputShape, 1);
   1598                 int32_t filter_width = getSizeOfDimension(filterShape, 2);
   1599                 int32_t filter_height = getSizeOfDimension(filterShape, 1);
   1600                 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
   1601                                          &padding_left, &padding_right);
   1602                 calculateExplicitPadding(input_height, stride_height, filter_height,
   1603                                          padding_implicit, &padding_top, &padding_bottom);
   1604             }
   1605 
   1606             if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
   1607                                     padding_right, padding_top, padding_bottom, stride_width,
   1608                                     stride_height, numGroups, &outShape) ||
   1609                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
   1610                 if (!data_layout) output.dimensions = output_tmp.dimensions;
   1611                 success = false;
   1612                 break;
   1613             }
   1614 
   1615             if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
   1616                 success = groupedConvFloat32(
   1617                         reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
   1618                         reinterpret_cast<const float*>(filter.buffer), filter.shape(),
   1619                         reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
   1620                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
   1621                         numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
   1622                         outShape);
   1623             } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
   1624                 success = groupedConvFloat16(
   1625                         reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
   1626                         reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
   1627                         reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
   1628                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
   1629                         numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
   1630                         outShape);
   1631             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
   1632                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
   1633                     success = groupedConvQuant8PerChannel(
   1634                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
   1635                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
   1636                             filter.extraParams.channelQuant().scales.data(),
   1637                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
   1638                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
   1639                             stride_height, numGroups, activation,
   1640                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
   1641                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
   1642                     success = groupedConvQuant8(
   1643                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
   1644                             reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
   1645                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
   1646                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
   1647                             stride_height, numGroups, activation,
   1648                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
   1649                 }
   1650             }
   1651 
   1652             if (data_layout) {
   1653                 output_tmp_guard.reset(output_tmp.buffer);
   1654             }
   1655             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
   1656                 success = false;
   1657                 break;
   1658             }
   1659         } break;
   1660         case OperationType::TILE: {
   1661             if (!allParametersPresent(2, 1)) {
   1662                 return ANEURALNETWORKS_BAD_DATA;
   1663             }
   1664             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1665             const RunTimeOperandInfo& multiples = mOperands[ins[1]];
   1666 
   1667             RunTimeOperandInfo& output = mOperands[outs[0]];
   1668             Shape outShape = output.shape();
   1669 
   1670             success =
   1671                     tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
   1672                                   multiples.shape(), &outShape) &&
   1673                     setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
   1674                     tile::eval(input.buffer, input.shape(),
   1675                                reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
   1676                                outShape);
   1677         } break;
   1678         case OperationType::QUANTIZED_16BIT_LSTM: {
   1679             if (!allParametersPresent(15, 2)) {
   1680                 return ANEURALNETWORKS_BAD_DATA;
   1681             }
   1682 
   1683             RunTimeOperandInfo& cellStateOut =
   1684                     mOperands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
   1685             RunTimeOperandInfo& output = mOperands[outs[QuantizedLSTMCell::kOutputTensor]];
   1686 
   1687             Shape cellStateOutShape, outputShape;
   1688             QuantizedLSTMCell quantizedLSTMCell(operation, mOperands);
   1689 
   1690             success = QuantizedLSTMCell::prepare(operation, mOperands, &cellStateOutShape,
   1691                                                  &outputShape) &&
   1692                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
   1693                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
   1694                       quantizedLSTMCell.eval();
   1695         } break;
   1696         case OperationType::POW: {
   1697             if (!allParametersPresent(2, 1)) {
   1698                 return ANEURALNETWORKS_BAD_DATA;
   1699             }
   1700             const RunTimeOperandInfo& base = mOperands[ins[0]];
   1701             const RunTimeOperandInfo& exponent = mOperands[ins[1]];
   1702 
   1703             RunTimeOperandInfo& output = mOperands[outs[0]];
   1704             Shape outShape = output.shape();
   1705 
   1706             success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
   1707                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
   1708                       pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
   1709                                 output.buffer, outShape);
   1710         } break;
   1711         case OperationType::TOPK_V2: {
   1712             if (!allParametersPresent(2, 2)) {
   1713                 return ANEURALNETWORKS_BAD_DATA;
   1714             }
   1715             const RunTimeOperandInfo& input = mOperands[ins[0]];
   1716             int32_t k = getScalarData<int32_t>(mOperands[ins[1]]);
   1717 
   1718             RunTimeOperandInfo& values = mOperands[outs[0]];
   1719             Shape valuesShape = values.shape();
   1720             RunTimeOperandInfo& indices = mOperands[outs[1]];
   1721             Shape indicesShape = indices.shape();
   1722 
   1723             success = topk_v2::prepare(input.shape(), k, &valuesShape, &indicesShape) &&
   1724                       setInfoAndAllocateIfNeeded(&values, valuesShape, &result) &&
   1725                       setInfoAndAllocateIfNeeded(&indices, indicesShape, &result) &&
   1726                       topk_v2::eval(input.buffer, input.shape(), k, values.buffer, valuesShape,
   1727                                     indices.buffer, indicesShape);
   1728         } break;
   1729         default: {
   1730             const OperationRegistration* operationRegistration =
   1731                     mOperationResolver->findOperation(operation.type);
   1732             if (operationRegistration == nullptr) {
   1733                 LOG(ERROR) << getOperationName(operation.type) << " not registered";
   1734             } else if (operationRegistration->prepare == nullptr ||
   1735                        operationRegistration->execute == nullptr) {
   1736                 LOG(ERROR) << "Incomplete operation registration: "
   1737                            << getOperationName(operation.type);
   1738             } else {
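                        // Operations without a dedicated case are dispatched through the
                        // OperationResolver: check omitted and zero-sized operands against the
                        // registration flags, then run prepare() and execute().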
   1739                 OperationExecutionContext context(&operation, mOperands.data());
   1740                 success = operationRegistration->flags.allowOmittedOperand ||
   1741                           context.checkNoOmittedOperand();
   1742                 success = success && (operationRegistration->flags.allowZeroSizedInput ||
   1743                                       context.checkNoZeroSizedInput());
   1744                 success = success && operationRegistration->prepare(&context) &&
   1745                           operationRegistration->execute(&context);
   1746                 result = context.getResultCode();
   1747             }
   1748         }
   1749     }
   1750     if (!success && result == ANEURALNETWORKS_NO_ERROR) {
   1751         result = ANEURALNETWORKS_OP_FAILED;
   1752     }
   1753     if (result != ANEURALNETWORKS_NO_ERROR) {
   1754         LOG(ERROR) << getOperationName(operation.type) << " failed.";
   1755         return result;
   1756     }
   1757 
   1758     freeNoLongerUsedOperands(ins);
   1759     return ANEURALNETWORKS_NO_ERROR;
   1760 }
   1761 
   1762 void CpuExecutor::finish(int result) {
   1763     // Free allocated temporary operands.
   1764     for (auto& info : mOperands) {
   1765         if (info.lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info.buffer != nullptr) {
   1766             delete[] info.buffer;
   1767             info.buffer = nullptr;
   1768         }
   1769     }
   1770 
   1771     // Only report the output shapes when the result code is NO_ERROR or
   1772     // OUTPUT_INSUFFICIENT_SIZE.
   1773     if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
   1774         const auto& outputs = mModel->outputIndexes;
   1775         mOutputShapes.resize(outputs.size());
   1776         for (uint32_t i = 0; i < outputs.size(); i++) {
   1777             const uint32_t operandIndex = outputs[i];
   1778             RunTimeOperandInfo& from = mOperands[operandIndex];
   1779             mOutputShapes[i].dimensions = from.dimensions;
   1780             mOutputShapes[i].isSufficient = from.isSufficient();
   1781         }
   1782     } else {
   1783         mOutputShapes.clear();
   1784     }
   1785 
   1786     mModel = nullptr;
   1787     mRequest = nullptr;
   1788     mFinished = true;
   1789 }
   1790 
   1791 // b/109953668, disable OpenMP
   1792 #ifdef NNAPI_OPENMP
   1793 ScopedOpenmpSettings::ScopedOpenmpSettings() {
   1794     mBlocktimeInitial = kmp_get_blocktime();
   1795     kmp_set_blocktime(20);  // ms, see b/109645291
   1796 
   1797 #if NNAPI_LIMIT_CPU_THREADS
   1798     // Code not yet enabled. The thread count below is chosen based on
   1799     // benchmarking. See the longer comment by the class declaration.
   1800     mMaxThreadsInitial = Eigen::nbThreads();
   1801     const int nProcs = omp_get_num_procs();
   1802     int threads = nProcs;
   1803     if (nProcs >= 8) {
   1804         threads = nProcs - 4;
   1805     } else if (nProcs >= 4) {
   1806         threads = nProcs - 2;
   1807     }
   1808     Eigen::setNbThreads(threads);
   1809 #endif
   1810 }
   1811 
   1812 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
   1813     kmp_set_blocktime(mBlocktimeInitial);
   1814 #if NNAPI_LIMIT_CPU_THREADS
   1815     Eigen::setNbThreads(mMaxThreadsInitial);
   1816 #endif
   1817 }
   1818 #endif  // NNAPI_OPENMP
   1819 
   1820 }  // namespace nn
   1821 }  // namespace android
   1822