Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 
     18 #include "rsCpuIntrinsic.h"
     19 #include "rsCpuIntrinsicInlines.h"
     20 #include "rsCpuBLASDispatch.h"
     21 #include "eight_bit_int_gemm.h"
     22 
     23 using namespace android;
     24 using namespace android::renderscript;
     25 
     26 namespace android {
     27 namespace renderscript {
     28 
     29 
     30 class RsdCpuScriptIntrinsicBLAS : public RsdCpuScriptIntrinsic {
     31 public:
     32     void invokeForEach(uint32_t slot,
     33                        const Allocation ** ain,
     34                        uint32_t inLen,
     35                        Allocation * aout,
     36                        const void * usr,
     37                        uint32_t usrLen,
     38                        const RsScriptCall *sc) override;
     39 
     40     void populateScript(Script *) override;
     41     ~RsdCpuScriptIntrinsicBLAS() override;
     42     RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx, const Script *s);
     43 
     44 protected:
     45 
     46     uint8_t a_offset = 0;
     47     uint8_t b_offset = 0;
     48     uint8_t c_offset = 0;
     49 
     50 #ifdef RS_COMPATIBILITY_LIB
     51     bool isBlasLibInitialized = false;
     52 #endif
     53     static void kernelBNNM(size_t m, size_t n, size_t k,
     54                            const uint8_t* a, uint8_t a_offset, size_t lda,
     55                            const uint8_t* b, uint8_t b_offset, size_t ldb,
     56                            uint8_t* c, int32_t c_offset, size_t ldc,
     57                            int32_t c_mult_int);
     58 
     59 
     60 
     61 };
     62 
     63 }
     64 }
     65 
     66 void RsdCpuScriptIntrinsicBLAS::populateScript(Script *s) {
     67     s->mHal.info.exportedVariableCount = 0;
     68 }
     69 
     70 static void initABC(const Allocation ** ain,
     71                     size_t size,
     72                     void** A,
     73                     void** B,
     74                     void** C,
     75                     int* lda,
     76                     int* ldb,
     77                     int* ldc)
     78 {
     79     if (ain[0]) {
     80         *A = ain[0]->mHal.drvState.lod[0].mallocPtr;
     81         *lda = (int)(ain[0]->mHal.drvState.lod[0].stride/size);
     82     }
     83     if (ain[1]) {
     84         *B = ain[1]->mHal.drvState.lod[0].mallocPtr;
     85         *ldb = (int)(ain[1]->mHal.drvState.lod[0].stride/size);
     86     }
     87     if (ain[2]) {
     88         *C = ain[2]->mHal.drvState.lod[0].mallocPtr;
     89         *ldc = (int)(ain[2]->mHal.drvState.lod[0].stride/size);
     90     }
     91 
     92 
     93 }
     94 
     95 void RsdCpuScriptIntrinsicBLAS::invokeForEach(uint32_t slot,
     96                                               const Allocation ** ain,
     97                                               uint32_t inLen,
     98                                               Allocation * aout,
     99                                               const void * usr,
    100                                               uint32_t usrLen,
    101                                               const RsScriptCall *sc) {
    102     RsBlasCall* call = (RsBlasCall*) usr;
    103     // setup BLAS enum args
    104     enum CBLAS_TRANSPOSE TransA = (enum CBLAS_TRANSPOSE)call->transA;
    105     enum CBLAS_TRANSPOSE TransB = (enum CBLAS_TRANSPOSE)call->transB;
    106     enum CBLAS_UPLO Uplo = (enum CBLAS_UPLO)call->uplo;
    107     enum CBLAS_DIAG Diag = (enum CBLAS_DIAG)call->diag;
    108     enum CBLAS_SIDE Side = (enum CBLAS_SIDE)call->side;
    109 
    110     void *A = nullptr;
    111     void *B = nullptr;
    112     void *C = nullptr;
    113     void *X = nullptr;
    114     void *Y = nullptr;
    115 
    116     int lda = 0, ldb = 0, ldc = 0;
    117 
    118 #ifdef RS_COMPATIBILITY_LIB
    119     // Allow BNNM even without libblas
    120     if (call->func != RsBlas_bnnm && !isBlasLibInitialized) {
    121         if (!loadBLASLib()) {
    122             ALOGE("Failed to load the BLAS lib, IntrinsicBLAS NOT supported!\n");
    123             return;
    124         }
    125         isBlasLibInitialized = true;
    126     }
    127 #endif
    128 
    129     switch (call->func) {
    130 
    131     // Level 1 BLAS: returns into a 1D Allocation
    132 
    133 
    134     // Level 2 BLAS
    135     case (RsBlas_sgemv):
    136         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
    137         cblas_sgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.f, (float*)A,
    138                     lda, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
    139         break;
    140     case (RsBlas_sgbmv):
    141         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
    142         cblas_sgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
    143                     call->alpha.f, (float*)A, lda, (float*)X, call->incX,
    144                     call->beta.f, (float*)Y, call->incY);
    145         break;
    146     case (RsBlas_strmv):
    147         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
    148         cblas_strmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
    149                     lda, (float*)X, call->incX);
    150         break;
    151     case (RsBlas_stbmv):
    152         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
    153         cblas_stbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
    154                     lda, (float*)X, call->incX);
    155         break;
    156     // stpmv takes a packed 1D Allocation only
    157     case (RsBlas_stpmv):
    158         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
    159         cblas_stpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
    160                     (float*)X, call->incX);
    161         break;
    162     case (RsBlas_strsv):
    163         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
    164         cblas_strsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A, lda,
    165                     (float*)X, call->incX);
    166         break;
    167     case (RsBlas_stbsv):
    168         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
    169         cblas_stbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
    170                     lda, (float*)X, call->incX);
    171         break;
    172     case (RsBlas_stpsv):
    173         initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
    174         cblas_stpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
    175                     (float*)X, call->incX);
    176         break;
    177     case (RsBlas_dgemv):
    178         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
    179         cblas_dgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.d, (double*)A,
    180                     lda, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
    181         break;
    182     case (RsBlas_dgbmv):
    183         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
    184         cblas_dgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
    185                     call->alpha.d, (double*)A, lda, (double*)X, call->incX,
    186                     call->beta.d, (double*)Y, call->incY);
    187         break;
    188     case (RsBlas_dtrmv):
    189         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
    190         cblas_dtrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
    191                     lda, (double*)X, call->incX);
    192         break;
    193     case (RsBlas_dtbmv):
    194         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
    195         cblas_dtbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
    196                     lda, (double*)X, call->incX);
    197         break;
    198     // stpmv takes a packed 1D Allocation only
    199     case (RsBlas_dtpmv):
    200         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
    201         cblas_dtpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
    202                     (double*)X, call->incX);
    203         break;
    204     case (RsBlas_dtrsv):
    205         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
    206         cblas_dtrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A, lda,
    207                     (double*)X, call->incX);
    208         break;
    209     case (RsBlas_dtbsv):
    210         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
    211         cblas_dtbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
    212                     lda, (double*)X, call->incX);
    213         break;
    214     case (RsBlas_dtpsv):
    215         initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
    216         cblas_dtpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
    217                     (double*)X, call->incX);
    218         break;
    219     case (RsBlas_cgemv):
    220         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    221         cblas_cgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.c, (void*)A,
    222                     lda, (void*)X, call->incX, (void*)&call->beta.c, (void*)Y, call->incY);
    223         break;
    224     case (RsBlas_cgbmv):
    225         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    226         cblas_cgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
    227                     (void*)&call->alpha.c, (void*)A, lda, (void*)X, call->incX,
    228                     (void*)&call->beta.c, (void*)Y, call->incY);
    229         break;
    230     case (RsBlas_ctrmv):
    231         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    232         cblas_ctrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
    233                     lda, (void*)X, call->incX);
    234         break;
    235     case (RsBlas_ctbmv):
    236         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    237         cblas_ctbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
    238                     lda, (void*)X, call->incX);
    239         break;
    240     // stpmv takes a packed 1D Allocation only
    241     case (RsBlas_ctpmv):
    242         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    243         cblas_ctpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
    244                     (void*)X, call->incX);
    245         break;
    246     case (RsBlas_ctrsv):
    247         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    248         cblas_ctrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
    249                     (void*)X, call->incX);
    250         break;
    251     case (RsBlas_ctbsv):
    252         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    253         cblas_ctbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
    254                     lda, (void*)X, call->incX);
    255         break;
    256     case (RsBlas_ctpsv):
    257         initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    258         cblas_ctpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
    259                     (void*)X, call->incX);
    260         break;
    261     case (RsBlas_zgemv):
    262         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    263         cblas_zgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.z, (void*)A,
    264                     lda, (void*)X, call->incX, (void*)&call->beta.z, (void*)Y, call->incY);
    265         break;
    266     case (RsBlas_zgbmv):
    267         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    268         cblas_zgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
    269                     (void*)&call->alpha.z, (void*)A, lda, (void*)X, call->incX,
    270                     (void*)&call->beta.z, (void*)Y, call->incY);
    271         break;
    272     case (RsBlas_ztrmv):
    273         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    274         cblas_ztrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
    275                     lda, (void*)X, call->incX);
    276         break;
    277     case (RsBlas_ztbmv):
    278         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    279         cblas_ztbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
    280                     lda, (void*)X, call->incX);
    281         break;
    282     // stpmv takes a packed 1D Allocation only
    283     case (RsBlas_ztpmv):
    284         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    285         cblas_ztpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
    286                     (void*)X, call->incX);
    287         break;
    288     case (RsBlas_ztrsv):
    289         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    290         cblas_ztrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
    291                     (void*)X, call->incX);
    292         break;
    293     case (RsBlas_ztbsv):
    294         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    295         cblas_ztbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
    296                     lda, (void*)X, call->incX);
    297         break;
    298     case (RsBlas_ztpsv):
    299         initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
    300         cblas_ztpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
    301                     (void*)X, call->incX);
    302         break;
    303 
    304 
    305     // S and D only
    306     case (RsBlas_ssymv):
    307         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
    308         cblas_ssymv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A, lda,
    309                     (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
    310         break;
    311     case (RsBlas_ssbmv):
    312         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
    313         cblas_ssbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.f,
    314                     (float*)A, lda, (float*)X, call->incX, call->beta.f,
    315                     (float*)Y, call->incY);
    316         break;
    317     //sspmv requires a packed 1D Allocation
    318     case (RsBlas_sspmv):
    319         initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
    320         cblas_sspmv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A,
    321                     (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
    322         break;
    323     // following calls have init reordered because A is output matrix
    324     case (RsBlas_sger):
    325         initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
    326         cblas_sger(CblasRowMajor, call->M, call->N, call->alpha.f, (float*)X,
    327                    call->incX, (float*)Y, call->incY, (float*)A, lda);
    328         break;
    329     case (RsBlas_ssyr):
    330         initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
    331         cblas_ssyr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
    332                    (float*)A, lda);
    333         break;
    334     // sspr is packed 1D Allocation A only
    335     case (RsBlas_sspr):
    336         initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
    337         cblas_sspr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
    338                    (float*)A);
    339         break;
    340     case (RsBlas_ssyr2):
    341         initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
    342         cblas_ssyr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
    343                     (float*)Y, call->incY, (float*)A, lda);
    344         break;
    345     // sspr2 is packed 1D Allocation A only
    346     case (RsBlas_sspr2):
    347         initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
    348         cblas_sspr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
    349                     (float*)Y, call->incY, (float*)A);
    350         break;
    351     case (RsBlas_dsymv):
    352         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
    353         cblas_dsymv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A, lda,
    354                     (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
    355         break;
    356     case (RsBlas_dsbmv):
    357         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
    358         cblas_dsbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.d,
    359                     (double*)A, lda, (double*)X, call->incX, call->beta.d,
    360                     (double*)Y, call->incY);
    361         break;
    362     // dspmv requires a packed 1D Allocation
    363     case (RsBlas_dspmv):
    364         initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
    365         cblas_dspmv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A,
    366                     (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
    367         break;
    368     // following calls have init reordered because A is output matrix
    369     case (RsBlas_dger):
    370         initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
    371         cblas_dger(CblasRowMajor, call->M, call->N, call->alpha.d, (double*)X,
    372                    call->incX, (double*)Y, call->incY, (double*)A, lda);
    373         break;
    374     case (RsBlas_dsyr):
    375         initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
    376         cblas_dsyr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
    377                    (double*)A, lda);
    378         break;
    379     // dspr is packed 1D Allocation A only
    380     case (RsBlas_dspr):
    381         initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
    382         cblas_dspr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
    383                    (double*)A);
    384         break;
    385     case (RsBlas_dsyr2):
    386         initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
    387         cblas_dsyr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
    388                     (double*)Y, call->incY, (double*)A, lda);
    389         break;
    390     // dspr2 is packed 1D Allocation A only
    391     case (RsBlas_dspr2):
    392         initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
    393         cblas_dspr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
    394                     (double*)Y, call->incY, (double*)A);
    395         break;
    396 
    397     // C and Z only
    398     case (RsBlas_chemv):
    399         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    400         cblas_chemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A, lda,
    401                     X, call->incX, (void*)&call->beta.c, Y, call->incY);
    402         break;
    403     case (RsBlas_chbmv):
    404         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    405         cblas_chbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.c,
    406                     A, lda, X, call->incX, (void*)&call->beta.c, Y, call->incY);
    407         break;
    408     case (RsBlas_chpmv):
    409         initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    410         cblas_chpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A,
    411                     X, call->incX, (void*)&call->beta.c, Y, call->incY);
    412         break;
    413     case (RsBlas_cgeru):
    414         initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
    415         cblas_cgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
    416                     X, call->incX, Y, call->incY, A, lda);
    417         break;
    418     case (RsBlas_cgerc):
    419         initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
    420         cblas_cgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
    421                     X, call->incX, Y, call->incY, A, lda);
    422         break;
    423     case (RsBlas_cher):
    424         initABC(ain, sizeof(float)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
    425         cblas_cher(CblasRowMajor, Uplo, call->N, call->alpha.f,
    426                    X, call->incX, A, lda);
    427         break;
    428     // packed 1D Allocations only
    429     case (RsBlas_chpr):
    430         initABC(ain, sizeof(float)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
    431         cblas_chpr(CblasRowMajor, Uplo, call->N, call->alpha.f, X,
    432                    call->incX, A);
    433         break;
    434     case (RsBlas_cher2):
    435         initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
    436         cblas_cher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c,
    437                    X, call->incX, Y, call->incY, A, lda);
    438         break;
    439     // packed 1D Allocations only
    440     case (RsBlas_chpr2):
    441         initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
    442         cblas_chpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, X,
    443                    call->incX, Y, call->incY, A);
    444         break;
    445     case (RsBlas_zhemv):
    446         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    447         cblas_zhemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A, lda,
    448                     X, call->incX, (void*)&call->beta.z, Y, call->incY);
    449         break;
    450     case (RsBlas_zhbmv):
    451         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    452         cblas_zhbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.z,
    453                     A, lda, X, call->incX, (void*)&call->beta.z, Y, call->incY);
    454         break;
    455     case (RsBlas_zhpmv):
    456         initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
    457         cblas_zhpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A,
    458                     X, call->incX, (void*)&call->beta.z, Y, call->incY);
    459         break;
    460     case (RsBlas_zgeru):
    461         initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
    462         cblas_zgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
    463                     X, call->incX, Y, call->incY, A, lda);
    464         break;
    465     case (RsBlas_zgerc):
    466         initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
    467         cblas_zgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
    468                     X, call->incX, Y, call->incY, A, lda);
    469         break;
    470     case (RsBlas_zher):
    471         initABC(ain, sizeof(double)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
    472         cblas_zher(CblasRowMajor, Uplo, call->N, call->alpha.d,
    473                    X, call->incX, A, lda);
    474         break;
    475     // packed 1D Allocations only
    476     case (RsBlas_zhpr):
    477         initABC(ain, sizeof(double)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
    478         cblas_zhpr(CblasRowMajor, Uplo, call->N, call->alpha.d, X,
    479                    call->incX, A);
    480         break;
    481     case (RsBlas_zher2):
    482         initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
    483         cblas_zher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z,
    484                    X, call->incX, Y, call->incY, A, lda);
    485         break;
    486     // packed 1D Allocations only
    487     case (RsBlas_zhpr2):
    488         initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
    489         cblas_zhpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, X,
    490                    call->incX, Y, call->incY, A);
    491         break;
    492 
    493     // Level 3 BLAS
    494     case (RsBlas_sgemm):
    495         initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
    496         cblas_sgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.f,
    497                     (float*)A, lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
    498         break;
    499     case (RsBlas_ssymm):
    500         initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
    501         cblas_ssymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.f, (float*)A,
    502                     lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
    503         break;
    504     case (RsBlas_ssyrk):
    505         initABC(ain, sizeof(float), &A, nullptr, &C, &lda, nullptr, &ldc);
    506         cblas_ssyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
    507                     lda, call->beta.f, (float*)C, ldc);
    508         break;
    509     case (RsBlas_ssyr2k):
    510         initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
    511         cblas_ssyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
    512                      lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
    513         break;
    514     case (RsBlas_strmm):
    515         initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
    516         cblas_strmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
    517                     (float*)A, lda, (float*)B, ldb);
    518         break;
    519     case (RsBlas_strsm):
    520         initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
    521         cblas_strsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
    522                     (float*)A, lda, (float*)B, ldb);
    523         break;
    524 
    525 
    526     case (RsBlas_dgemm):
    527         initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
    528         cblas_dgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.d,
    529                     (double*)A, lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
    530         break;
    531     case (RsBlas_dsymm):
    532         initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
    533         cblas_dsymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.d, (double*)A,
    534                     lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
    535         break;
    536     case (RsBlas_dsyrk):
    537         initABC(ain, sizeof(double), &A, nullptr, &C, &lda, nullptr, &ldc);
    538         cblas_dsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
    539                     lda, call->beta.d, (double*)C, ldc);
    540         break;
    541     case (RsBlas_dsyr2k):
    542         initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
    543         cblas_dsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
    544                      lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
    545         break;
    546     case (RsBlas_dtrmm):
    547         initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
    548         cblas_dtrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
    549                     (double*)A, lda, (double*)B, ldb);
    550         break;
    551     case (RsBlas_dtrsm):
    552         initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
    553         cblas_dtrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
    554                     (double*)A, lda, (double*)B, ldb);
    555         break;
    556 
    557     case (RsBlas_cgemm):
    558         initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
    559         cblas_cgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.c,
    560                     A, lda, B, ldb, (void*)&call->beta.c, C, ldc);
    561         break;
    562     case (RsBlas_csymm):
    563         initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
    564         cblas_csymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A,
    565                     lda, B, ldb, (void*)&call->beta.c, C, ldc);
    566         break;
    567     case (RsBlas_csyrk):
    568         initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
    569         cblas_csyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
    570                     lda, (void*)&call->beta.c, C, ldc);
    571         break;
    572     case (RsBlas_csyr2k):
    573         initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
    574         cblas_csyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
    575                      lda, B, ldb, (void*)&call->beta.c, C, ldc);
    576         break;
    577     case (RsBlas_ctrmm):
    578         initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
    579         cblas_ctrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
    580                     A, lda, B, ldb);
    581         break;
    582     case (RsBlas_ctrsm):
    583         initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
    584         cblas_ctrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
    585                     A, lda, B, ldb);
    586         break;
    587 
    588     case (RsBlas_zgemm):
    589         initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
    590         cblas_zgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.z,
    591                     A, lda, B, ldb, (void*)&call->beta.z, C, ldc);
    592         break;
    593     case (RsBlas_zsymm):
    594         initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
    595         cblas_zsymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A,
    596                     lda, B, ldb, (void*)&call->beta.z, C, ldc);
    597         break;
    598     case (RsBlas_zsyrk):
    599         initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
    600         cblas_zsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
    601                     lda, (void*)&call->beta.z, C, ldc);
    602         break;
    603     case (RsBlas_zsyr2k):
    604         initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
    605         cblas_zsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
    606                      lda, B, ldb, (void*)&call->beta.z, C, ldc);
    607         break;
    608     case (RsBlas_ztrmm):
    609         initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
    610         cblas_ztrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
    611                     A, lda, B, ldb);
    612         break;
    613     case (RsBlas_ztrsm):
    614         initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
    615         cblas_ztrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
    616                     A, lda, B, ldb);
    617         break;
    618 
    619     // Level 3 C and Z only
    620     case (RsBlas_chemm):
    621         initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
    622         cblas_chemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A, lda,
    623                     B, ldb, (void*)&call->beta.c, C, ldc);
    624         break;
    625     case (RsBlas_cherk):
    626         initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
    627         cblas_cherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, A, lda,
    628                     call->beta.f, C, ldc);
    629         break;
    630     case (RsBlas_cher2k):
    631         initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
    632         cblas_cher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A, lda,
    633                      B, ldb, call->beta.f, C, ldc);
    634         break;
    635 
    636     case (RsBlas_zhemm):
    637         initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
    638         cblas_zhemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A, lda,
    639                     B, ldb, (void*)&call->beta.z, C, ldc);
    640         break;
    641     case (RsBlas_zherk):
    642         initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
    643         cblas_zherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, A, lda,
    644                     call->beta.d, C, ldc);
    645         break;
    646     case (RsBlas_zher2k):
    647         initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
    648         cblas_zher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A, lda,
    649                      B, ldb, call->beta.d, C, ldc);
    650         break;
    651 
    652 
    653     case (RsBlas_bnnm):
    654         initABC(ain, sizeof(uint8_t), &A, &B, &C, &lda, &ldb, &ldc);
    655         kernelBNNM(call->M, call->N, call->K,
    656                     (const uint8_t*)A, call->a_offset, lda,
    657                     (const uint8_t*)B, call->b_offset, ldb,
    658                     (uint8_t*)C, call->c_offset, ldc,
    659                     call->c_mult_int);
    660 
    661         break;
    662 
    663     default:
    664         ALOGE("unimplemented\n");
    665     }
    666 
    667 
    668 }
    669 
    670 void RsdCpuScriptIntrinsicBLAS::kernelBNNM(size_t m, size_t n, size_t k,
    671                                            const uint8_t* a, uint8_t a_offset, size_t lda,
    672                                            const uint8_t* b, uint8_t b_offset, size_t ldb,
    673                                            uint8_t* c, int32_t c_offset, size_t ldc,
    674                                            int32_t c_mult_int) {
    675     const int c_shift = 21;
    676 #if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
    677     // Non-optimized path for ARMv7 devices without SIMD instructions.
    678     if (!gArchUseSIMD) {
    679         /*
    680          * Calculations are done in 1.10.21 fixed-point format for the final output,
    681          * just before there's a shift down to drop the fractional parts. The output
    682          * values are gated to 0 to 255 to fit in a byte, but the 10-bit format
    683          * gives some headroom to avoid wrapping around on small overflows.
    684          */
    685         size_t i = 0, j = 0, l = 0;
    686         for (j = 0; j < n; j++) {
    687             for (i = 0; i < m; i++) {
    688                 int32_t total = 0;
    689                 for (l = 0; l < k; l++) {
    690                     const int a_index = ((i * lda) + l);
    691                     const uint8_t a_as_byte = a[a_index];
    692                     const int32_t a_as_int = (((int32_t)(a_as_byte)) - a_offset);
    693                     const int b_index = ((j * ldb) + l);
    694                     const uint8_t b_as_byte = b[b_index];
    695                     const int32_t b_as_int = (((int32_t)(b_as_byte)) - b_offset);
    696                     const int32_t mult_as_int = (a_as_int * b_as_int);
    697                     total += mult_as_int;
    698                 }
    699                 const int c_index = ((ldc * i) + j);
    700                 int32_t output =
    701                     ((((total + c_offset) * c_mult_int) + (1 << (c_shift - 1)))
    702                      >> c_shift);
    703                 if (output > 255) {
    704                     output = 255;
    705                 }
    706                 if (output < 0) {
    707                     output = 0;
    708                 }
    709                 c[c_index] = (uint8_t)(output);
    710             }
    711         }
    712         return;
    713     }
    714 #endif
    715 
    716     // Using gemmlowp to calculate the low precision 8 bit GEMM.
    717     bool transpose_a = true;
    718     bool transpose_b = false;
    719     bool transpose_c = true;
    720     gemmlowp::eight_bit_int_gemm::EightBitIntGemm(transpose_a, transpose_b, transpose_c,
    721                                                   m, n, k, a, -a_offset, lda,
    722                                                   b, -b_offset, ldb, c, c_offset,
    723                                                   c_mult_int, c_shift, ldc,
    724                                                   gemmlowp::eight_bit_int_gemm::BitDepthSetting::A8B8);
    725 
    726 }
    727 
    728 
    729 
    730 
    731 
    732 RsdCpuScriptIntrinsicBLAS::RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx,
    733                                                    const Script *s)
    734             : RsdCpuScriptIntrinsic(ctx, s, nullptr, RS_SCRIPT_INTRINSIC_ID_BLAS) {
    735 
    736 
    737 }
    738 
    739 RsdCpuScriptIntrinsicBLAS::~RsdCpuScriptIntrinsicBLAS() {
    740 }
    741 
    742 
    743 
    744 
    745 
    746 RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
    747                                     const Script *s, const Element *e) {
    748 
    749     return new RsdCpuScriptIntrinsicBLAS(ctx, s);
    750 }
    751