Home | History | Annotate | Download | only in renderscript
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package android.renderscript;
     18 
     19 import android.annotation.IntDef;
     20 import java.lang.annotation.Retention;
     21 import java.lang.annotation.RetentionPolicy;
     22 
     23 /**
     24  *
     25  * ScriptIntrinsicBLAS class provides high performance RenderScript APIs to BLAS.
     26  *
     27  * The BLAS (Basic Linear Algebra Subprograms) are routines that provide standard
     28  * building blocks for performing basic vector and matrix operations.
     29  *
     30  * For detailed description of BLAS, please refer to http://www.netlib.org/blas/
     31  *
     32  **/
     33 public final class ScriptIntrinsicBLAS extends ScriptIntrinsic {
     34     private Allocation mLUT;
     35 
     36     private ScriptIntrinsicBLAS(long id, RenderScript rs) {
     37         super(id, rs);
     38     }
     39 
     40     private static final int RsBlas_sdsdot = 1;
     41     private static final int RsBlas_dsdot = 2;
     42     private static final int RsBlas_sdot = 3;
     43     private static final int RsBlas_ddot = 4;
     44     private static final int RsBlas_cdotu_sub = 5;
     45     private static final int RsBlas_cdotc_sub = 6;
     46     private static final int RsBlas_zdotu_sub = 7;
     47     private static final int RsBlas_zdotc_sub = 8;
     48     private static final int RsBlas_snrm2 = 9;
     49     private static final int RsBlas_sasum = 10;
     50     private static final int RsBlas_dnrm2 = 11;
     51     private static final int RsBlas_dasum = 12;
     52     private static final int RsBlas_scnrm2 = 13;
     53     private static final int RsBlas_scasum = 14;
     54     private static final int RsBlas_dznrm2 = 15;
     55     private static final int RsBlas_dzasum = 16;
     56     private static final int RsBlas_isamax = 17;
     57     private static final int RsBlas_idamax = 18;
     58     private static final int RsBlas_icamax = 19;
     59     private static final int RsBlas_izamax = 20;
     60     private static final int RsBlas_sswap = 21;
     61     private static final int RsBlas_scopy = 22;
     62     private static final int RsBlas_saxpy = 23;
     63     private static final int RsBlas_dswap = 24;
     64     private static final int RsBlas_dcopy = 25;
     65     private static final int RsBlas_daxpy = 26;
     66     private static final int RsBlas_cswap = 27;
     67     private static final int RsBlas_ccopy = 28;
     68     private static final int RsBlas_caxpy = 29;
     69     private static final int RsBlas_zswap = 30;
     70     private static final int RsBlas_zcopy = 31;
     71     private static final int RsBlas_zaxpy = 32;
     72     private static final int RsBlas_srotg = 33;
     73     private static final int RsBlas_srotmg = 34;
     74     private static final int RsBlas_srot = 35;
     75     private static final int RsBlas_srotm = 36;
     76     private static final int RsBlas_drotg = 37;
     77     private static final int RsBlas_drotmg = 38;
     78     private static final int RsBlas_drot = 39;
     79     private static final int RsBlas_drotm = 40;
     80     private static final int RsBlas_sscal = 41;
     81     private static final int RsBlas_dscal = 42;
     82     private static final int RsBlas_cscal = 43;
     83     private static final int RsBlas_zscal = 44;
     84     private static final int RsBlas_csscal = 45;
     85     private static final int RsBlas_zdscal = 46;
     86     private static final int RsBlas_sgemv = 47;
     87     private static final int RsBlas_sgbmv = 48;
     88     private static final int RsBlas_strmv = 49;
     89     private static final int RsBlas_stbmv = 50;
     90     private static final int RsBlas_stpmv = 51;
     91     private static final int RsBlas_strsv = 52;
     92     private static final int RsBlas_stbsv = 53;
     93     private static final int RsBlas_stpsv = 54;
     94     private static final int RsBlas_dgemv = 55;
     95     private static final int RsBlas_dgbmv = 56;
     96     private static final int RsBlas_dtrmv = 57;
     97     private static final int RsBlas_dtbmv = 58;
     98     private static final int RsBlas_dtpmv = 59;
     99     private static final int RsBlas_dtrsv = 60;
    100     private static final int RsBlas_dtbsv = 61;
    101     private static final int RsBlas_dtpsv = 62;
    102     private static final int RsBlas_cgemv = 63;
    103     private static final int RsBlas_cgbmv = 64;
    104     private static final int RsBlas_ctrmv = 65;
    105     private static final int RsBlas_ctbmv = 66;
    106     private static final int RsBlas_ctpmv = 67;
    107     private static final int RsBlas_ctrsv = 68;
    108     private static final int RsBlas_ctbsv = 69;
    109     private static final int RsBlas_ctpsv = 70;
    110     private static final int RsBlas_zgemv = 71;
    111     private static final int RsBlas_zgbmv = 72;
    112     private static final int RsBlas_ztrmv = 73;
    113     private static final int RsBlas_ztbmv = 74;
    114     private static final int RsBlas_ztpmv = 75;
    115     private static final int RsBlas_ztrsv = 76;
    116     private static final int RsBlas_ztbsv = 77;
    117     private static final int RsBlas_ztpsv = 78;
    118     private static final int RsBlas_ssymv = 79;
    119     private static final int RsBlas_ssbmv = 80;
    120     private static final int RsBlas_sspmv = 81;
    121     private static final int RsBlas_sger = 82;
    122     private static final int RsBlas_ssyr = 83;
    123     private static final int RsBlas_sspr = 84;
    124     private static final int RsBlas_ssyr2 = 85;
    125     private static final int RsBlas_sspr2 = 86;
    126     private static final int RsBlas_dsymv = 87;
    127     private static final int RsBlas_dsbmv = 88;
    128     private static final int RsBlas_dspmv = 89;
    129     private static final int RsBlas_dger = 90;
    130     private static final int RsBlas_dsyr = 91;
    131     private static final int RsBlas_dspr = 92;
    132     private static final int RsBlas_dsyr2 = 93;
    133     private static final int RsBlas_dspr2 = 94;
    134     private static final int RsBlas_chemv = 95;
    135     private static final int RsBlas_chbmv = 96;
    136     private static final int RsBlas_chpmv = 97;
    137     private static final int RsBlas_cgeru = 98;
    138     private static final int RsBlas_cgerc = 99;
    139     private static final int RsBlas_cher = 100;
    140     private static final int RsBlas_chpr = 101;
    141     private static final int RsBlas_cher2 = 102;
    142     private static final int RsBlas_chpr2 = 103;
    143     private static final int RsBlas_zhemv = 104;
    144     private static final int RsBlas_zhbmv = 105;
    145     private static final int RsBlas_zhpmv = 106;
    146     private static final int RsBlas_zgeru = 107;
    147     private static final int RsBlas_zgerc = 108;
    148     private static final int RsBlas_zher = 109;
    149     private static final int RsBlas_zhpr = 110;
    150     private static final int RsBlas_zher2 = 111;
    151     private static final int RsBlas_zhpr2 = 112;
    152     private static final int RsBlas_sgemm = 113;
    153     private static final int RsBlas_ssymm = 114;
    154     private static final int RsBlas_ssyrk = 115;
    155     private static final int RsBlas_ssyr2k = 116;
    156     private static final int RsBlas_strmm = 117;
    157     private static final int RsBlas_strsm = 118;
    158     private static final int RsBlas_dgemm = 119;
    159     private static final int RsBlas_dsymm = 120;
    160     private static final int RsBlas_dsyrk = 121;
    161     private static final int RsBlas_dsyr2k = 122;
    162     private static final int RsBlas_dtrmm = 123;
    163     private static final int RsBlas_dtrsm = 124;
    164     private static final int RsBlas_cgemm = 125;
    165     private static final int RsBlas_csymm = 126;
    166     private static final int RsBlas_csyrk = 127;
    167     private static final int RsBlas_csyr2k = 128;
    168     private static final int RsBlas_ctrmm = 129;
    169     private static final int RsBlas_ctrsm = 130;
    170     private static final int RsBlas_zgemm = 131;
    171     private static final int RsBlas_zsymm = 132;
    172     private static final int RsBlas_zsyrk = 133;
    173     private static final int RsBlas_zsyr2k = 134;
    174     private static final int RsBlas_ztrmm = 135;
    175     private static final int RsBlas_ztrsm = 136;
    176     private static final int RsBlas_chemm = 137;
    177     private static final int RsBlas_cherk = 138;
    178     private static final int RsBlas_cher2k = 139;
    179     private static final int RsBlas_zhemm = 140;
    180     private static final int RsBlas_zherk = 141;
    181     private static final int RsBlas_zher2k = 142;
    182 
    183     // BLAS extensions start here
    184     private static final int RsBlas_bnnm = 1000;
    185 
    186     /**
    187      * Create an intrinsic to access BLAS subroutines.
    188      *
    189      * @param rs The RenderScript context
    190      * @return ScriptIntrinsicBLAS
    191      */
    192     public static ScriptIntrinsicBLAS create(RenderScript rs) {
    193         long id = rs.nScriptIntrinsicCreate(13, Element.U32(rs).getID(rs));
    194         return new ScriptIntrinsicBLAS(id, rs);
    195     }
    196 
    197     /**
    198      * @hide
    199      */
    200     @IntDef({NO_TRANSPOSE, TRANSPOSE, CONJ_TRANSPOSE})
    201     @Retention(RetentionPolicy.SOURCE)
    202     public @interface Transpose {}
    203 
    204     /**
    205      * @hide
    206      */
    207     @IntDef({UPPER, LOWER})
    208     @Retention(RetentionPolicy.SOURCE)
    209     public @interface Uplo {}
    210 
    211     /**
    212      * @hide
    213      */
    214     @IntDef({NON_UNIT, UNIT})
    215     @Retention(RetentionPolicy.SOURCE)
    216     public @interface Diag {}
    217 
    218     /**
    219      * @hide
    220      */
    221     @IntDef({LEFT, RIGHT})
    222     @Retention(RetentionPolicy.SOURCE)
    223     public @interface Side {}
    224 
    225     public static final int NO_TRANSPOSE = 111;
    226     public static final int TRANSPOSE = 112;
    227     public static final int CONJ_TRANSPOSE = 113;
    228 
    229     public static final int UPPER = 121;
    230     public static final int LOWER = 122;
    231 
    232     public static final int NON_UNIT = 131;
    233     public static final int UNIT = 132;
    234 
    235     public static final int LEFT = 141;
    236     public static final int RIGHT = 142;
    237 
    238     static void validateSide(@Side int Side) {
    239         if (Side != LEFT && Side != RIGHT) {
    240             throw new RSRuntimeException("Invalid side passed to BLAS");
    241         }
    242     }
    243 
    244     static void validateTranspose(@Transpose int Trans) {
    245         if (Trans != NO_TRANSPOSE && Trans != TRANSPOSE &&
    246             Trans != CONJ_TRANSPOSE) {
    247             throw new RSRuntimeException("Invalid transpose passed to BLAS");
    248         }
    249     }
    250 
    251     static void validateConjTranspose(@Transpose int Trans) {
    252         if (Trans != NO_TRANSPOSE &&
    253             Trans != CONJ_TRANSPOSE) {
    254             throw new RSRuntimeException("Invalid transpose passed to BLAS");
    255         }
    256     }
    257 
    258     static void validateDiag(@Diag int Diag) {
    259         if (Diag != NON_UNIT && Diag != UNIT) {
    260             throw new RSRuntimeException("Invalid diag passed to BLAS");
    261         }
    262     }
    263 
    264     static void validateUplo(@Uplo int Uplo) {
    265         if (Uplo != UPPER && Uplo != LOWER) {
    266             throw new RSRuntimeException("Invalid uplo passed to BLAS");
    267         }
    268     }
    269 
    270 
    271     /**
    272      * Level 2 BLAS
    273      */
    274 
    275     static void validateGEMV(Element e, int TransA, Allocation A, Allocation X, int incX, Allocation Y, int incY) {
    276         validateTranspose(TransA);
    277         int M = A.getType().getY();
    278         int N = A.getType().getX();
    279         if (!A.getType().getElement().isCompatible(e) ||
    280             !X.getType().getElement().isCompatible(e) ||
    281             !Y.getType().getElement().isCompatible(e)) {
    282             throw new RSRuntimeException("Called BLAS with wrong Element type");
    283         }
    284         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
    285             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
    286         }
    287 
    288         if (incX <= 0 || incY <= 0) {
    289             throw new RSRuntimeException("Vector increments must be greater than 0");
    290         }
    291         int expectedXDim = -1, expectedYDim = -1;
    292         if (TransA == NO_TRANSPOSE) {
    293             expectedXDim = 1 + (N - 1) * incX;
    294             expectedYDim = 1 + (M - 1) * incY;
    295         } else {
    296             expectedXDim = 1 + (M - 1) * incX;
    297             expectedYDim = 1 + (N - 1) * incY;
    298         }
    299         if (X.getType().getX() != expectedXDim ||
    300             Y.getType().getX() != expectedYDim) {
    301             throw new RSRuntimeException("Incorrect vector dimensions for GEMV");
    302         }
    303     }
    304 
    305     /**
    306      * SGEMV performs one of the matrix-vector operations
    307      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
    308      *
    309      * Details: http://www.netlib.org/lapack/explore-html/db/d58/sgemv_8f.html
    310      *
    311      * @param TransA The type of transpose applied to matrix A.
    312      * @param alpha The scalar alpha.
    313      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
    314      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
    315      * @param incX The increment for the elements of vector x, must be larger than zero.
    316      * @param beta The scalar beta.
    317      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
    318      * @param incY The increment for the elements of vector y, must be larger than zero.
    319      */
    320     public void SGEMV(@Transpose int TransA, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
    321         validateGEMV(Element.F32(mRS), TransA, A, X, incX, Y, incY);
    322         int M = A.getType().getY();
    323         int N = A.getType().getX();
    324         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
    325     }
    326 
    327     /**
    328      * DGEMV performs one of the matrix-vector operations
    329      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
    330      *
    331      * Details: http://www.netlib.org/lapack/explore-html/dc/da8/dgemv_8f.html
    332      *
    333      * @param TransA The type of transpose applied to matrix A.
    334      * @param alpha The scalar alpha.
    335      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
    336      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
    337      * @param incX The increment for the elements of vector x, must be larger than zero.
    338      * @param beta The scalar beta.
    339      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
    340      * @param incY The increment for the elements of vector y, must be larger than zero.
    341      */
    342     public void DGEMV(@Transpose int TransA, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
    343         validateGEMV(Element.F64(mRS), TransA, A, X, incX, Y, incY);
    344         int M = A.getType().getY();
    345         int N = A.getType().getX();
    346         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
    347     }
    348 
    349     /**
    350      * CGEMV performs one of the matrix-vector operations
    351      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
    352      *
    353      * Details: http://www.netlib.org/lapack/explore-html/d4/d8a/cgemv_8f.html
    354      *
    355      * @param TransA The type of transpose applied to matrix A.
    356      * @param alpha The scalar alpha.
    357      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
    358      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
    359      * @param incX The increment for the elements of vector x, must be larger than zero.
    360      * @param beta The scalar beta.
    361      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
    362      * @param incY The increment for the elements of vector y, must be larger than zero.
    363      */
    364     public void CGEMV(@Transpose int TransA, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
    365         validateGEMV(Element.F32_2(mRS), TransA, A, X, incX, Y, incY);
    366         int M = A.getType().getY();
    367         int N = A.getType().getX();
    368         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
    369     }
    370 
    371     /**
    372      * ZGEMV performs one of the matrix-vector operations
    373      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
    374      *
    375      * Details: http://www.netlib.org/lapack/explore-html/db/d40/zgemv_8f.html
    376      *
    377      * @param TransA The type of transpose applied to matrix A.
    378      * @param alpha The scalar alpha.
    379      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
    380      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
    381      * @param incX The increment for the elements of vector x, must be larger than zero.
    382      * @param beta The scalar beta.
    383      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
    384      * @param incY The increment for the elements of vector y, must be larger than zero.
    385      */
    386     public void ZGEMV(@Transpose int TransA, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
    387         validateGEMV(Element.F64_2(mRS), TransA, A, X, incX, Y, incY);
    388         int M = A.getType().getY();
    389         int N = A.getType().getX();
    390         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
    391     }
    392 
    393     /**
    394      * SGBMV performs one of the matrix-vector operations
    395      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
    396      *
    397      * Details: http://www.netlib.org/lapack/explore-html/d6/d46/sgbmv_8f.html
    398      *
    399      * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
    400      *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
    401      *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
    402      *           for i in range(0, m):
    403      *              for j in range(max(0, i-kl), min(i+ku+1, n)):
    404      *                  b[i, j-i+kl] = a[i, j]
    405      *
    406      * @param TransA The type of transpose applied to matrix A.
    407      * @param KL The number of sub-diagonals of the matrix A.
    408      * @param KU The number of super-diagonals of the matrix A.
    409      * @param alpha The scalar alpha.
    410      * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F32}.
    411      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
    412      * @param incX The increment for the elements of vector x, must be larger than zero.
    413      * @param beta The scalar beta.
    414      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
    415      * @param incY The increment for the elements of vector y, must be larger than zero.
    416      */
    417     public void SGBMV(@Transpose int TransA, int KL, int KU, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
    418         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
    419         validateGEMV(Element.F32(mRS), TransA, A, X, incX, Y, incY);
    420         if (KL < 0 || KU < 0) {
    421             throw new RSRuntimeException("KL and KU must be greater than or equal to 0");
    422         }
    423         int M = A.getType().getY();
    424         int N = A.getType().getX();
    425         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, KL, KU);
    426     }
    427 
    428     /**
    429      * DGBMV performs one of the matrix-vector operations
    430      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
    431      *
    432      * Details: http://www.netlib.org/lapack/explore-html/d2/d3f/dgbmv_8f.html
    433      *
    434      * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
    435      *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
    436      *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
    437      *           for i in range(0, m):
    438      *              for j in range(max(0, i-kl), min(i+ku+1, n)):
    439      *                  b[i, j-i+kl] = a[i, j]
    440      *
    441      * @param TransA The type of transpose applied to matrix A.
    442      * @param KL The number of sub-diagonals of the matrix A.
    443      * @param KU The number of super-diagonals of the matrix A.
    444      * @param alpha The scalar alpha.
    445      * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F64}.
    446      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
    447      * @param incX The increment for the elements of vector x, must be larger than zero.
    448      * @param beta The scalar beta.
    449      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
    450      * @param incY The increment for the elements of vector y, must be larger than zero.
    451      */
    452     public void DGBMV(@Transpose int TransA, int KL, int KU, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
    453         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
    454         validateGEMV(Element.F64(mRS), TransA, A, X, incX, Y, incY);
    455         if (KL < 0 || KU < 0) {
    456             throw new RSRuntimeException("KL and KU must be greater than or equal to 0");
    457         }
    458         int M = A.getType().getY();
    459         int N = A.getType().getX();
    460         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, KL, KU);
    461     }
    462 
    463     /**
    464      * CGBMV performs one of the matrix-vector operations
    465      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
    466      *
    467      * Details: http://www.netlib.org/lapack/explore-html/d0/d75/cgbmv_8f.html
    468      *
    469      * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
    470      *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
    471      *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
    472      *           for i in range(0, m):
    473      *              for j in range(max(0, i-kl), min(i+ku+1, n)):
    474      *                  b[i, j-i+kl] = a[i, j]
    475      *
    476      * @param TransA The type of transpose applied to matrix A.
    477      * @param KL The number of sub-diagonals of the matrix A.
    478      * @param KU The number of super-diagonals of the matrix A.
    479      * @param alpha The scalar alpha.
    480      * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F32_2}.
    481      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
    482      * @param incX The increment for the elements of vector x, must be larger than zero.
    483      * @param beta The scalar beta.
    484      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
    485      * @param incY The increment for the elements of vector y, must be larger than zero.
    486      */
    487     public void CGBMV(@Transpose int TransA, int KL, int KU, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
    488         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
    489         validateGEMV(Element.F32_2(mRS), TransA, A, X, incX, Y, incY);
    490         if (KL < 0 || KU < 0) {
    491             throw new RSRuntimeException("KL and KU must be greater than or equal to 0");
    492         }
    493         int M = A.getType().getY();
    494         int N = A.getType().getX();
    495         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, KL, KU);
    496     }
    497 
    498     /**
    499      * ZGBMV performs one of the matrix-vector operations
    500      * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
    501      *
    502      * Details: http://www.netlib.org/lapack/explore-html/d9/d46/zgbmv_8f.html
    503      *
    504      * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
    505      *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
    506      *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
    507      *           for i in range(0, m):
    508      *              for j in range(max(0, i-kl), min(i+ku+1, n)):
    509      *                  b[i, j-i+kl] = a[i, j]
    510      *
    511      * @param TransA The type of transpose applied to matrix A.
    512      * @param KL The number of sub-diagonals of the matrix A.
    513      * @param KU The number of super-diagonals of the matrix A.
    514      * @param alpha The scalar alpha.
    515      * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F64_2}.
    516      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
    517      * @param incX The increment for the elements of vector x, must be larger than zero.
    518      * @param beta The scalar beta.
    519      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
    520      * @param incY The increment for the elements of vector y, must be larger than zero.
    521      */
    522     public void ZGBMV(@Transpose int TransA, int KL, int KU, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
    523         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
    524         validateGEMV(Element.F64_2(mRS), TransA, A, X, incX, Y, incY);
    525         if (KL < 0 || KU < 0) {
    526             throw new RSRuntimeException("KL and KU must be greater than or equal to 0");
    527         }
    528         int M = A.getType().getY();
    529         int N = A.getType().getX();
    530         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, KL, KU);
    531     }
    532 
    533     static void validateTRMV(Element e, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
    534         validateTranspose(TransA);
    535         validateUplo(Uplo);
    536         validateDiag(Diag);
    537         int N = A.getType().getY();
    538         if (A.getType().getX() != N) {
    539             throw new RSRuntimeException("A must be a square matrix for TRMV");
    540         }
    541         if (!A.getType().getElement().isCompatible(e) ||
    542             !X.getType().getElement().isCompatible(e)) {
    543             throw new RSRuntimeException("Called BLAS with wrong Element type");
    544         }
    545         if (X.getType().getY() > 1) {
    546             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
    547         }
    548 
    549         if (incX <= 0) {
    550             throw new RSRuntimeException("Vector increments must be greater than 0");
    551         }
    552         int expectedXDim = 1 + (N - 1) * incX;
    553         if (X.getType().getX() != expectedXDim) {
    554             throw new RSRuntimeException("Incorrect vector dimensions for TRMV");
    555         }
    556     }
    557 
    558     static int validateTPMV(Element e, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation Ap, Allocation X, int incX) {
    559         validateTranspose(TransA);
    560         validateUplo(Uplo);
    561         validateDiag(Diag);
    562         if (!Ap.getType().getElement().isCompatible(e) ||
    563             !X.getType().getElement().isCompatible(e)) {
    564             throw new RSRuntimeException("Called BLAS with wrong Element type");
    565         }
    566         if (X.getType().getY() > 1) {
    567             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
    568         }
    569 
    570         if (Ap.getType().getY() > 1) {
    571             throw new RSRuntimeException("Ap must have a Y dimension of 0 or 1");
    572         }
    573 
    574         int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
    575         //is it really doing anything?
    576         if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
    577             throw new RSRuntimeException("Invalid dimension for Ap");
    578         }
    579         if (incX <= 0) {
    580             throw new RSRuntimeException("Vector increments must be greater than 0");
    581         }
    582         int expectedXDim = 1 + (N - 1) * incX;
    583         if (X.getType().getX() != expectedXDim) {
    584             throw new RSRuntimeException("Incorrect vector dimensions for TPMV");
    585         }
    586 
    587         return N;
    588     }
    589 
    590     /**
    591      * STRMV performs one of the matrix-vector operations
    592      * x := A*x   or   x := A**T*x
    593      *
    594      * Details: http://www.netlib.org/lapack/explore-html/de/d45/strmv_8f.html
    595      *
    596      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    597      * @param TransA The type of transpose applied to matrix A.
    598      * @param Diag Specifies whether or not A is unit triangular.
    599      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
    600      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
    601      * @param incX The increment for the elements of vector x, must be larger than zero.
    602      */
    603     public void STRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
    604         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
    605         int N = A.getType().getY();
    606         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
    607     }
    608 
    609     /**
    610      * DTRMV performs one of the matrix-vector operations
    611      * x := A*x   or   x := A**T*x
    612      *
    613      * Details: http://www.netlib.org/lapack/explore-html/dc/d7e/dtrmv_8f.html
    614      *
    615      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    616      * @param TransA The type of transpose applied to matrix A.
    617      * @param Diag Specifies whether or not A is unit triangular.
    618      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
    619      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
    620      * @param incX The increment for the elements of vector x, must be larger than zero.
    621      */
    622     public void DTRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
    623         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
    624         int N = A.getType().getY();
    625         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
    626     }
    627 
    628     /**
    629      * CTRMV performs one of the matrix-vector operations
    630      * x := A*x   or   x := A**T*x   or   x := A**H*x
    631      *
    632      * Details: http://www.netlib.org/lapack/explore-html/df/d78/ctrmv_8f.html
    633      *
    634      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    635      * @param TransA The type of transpose applied to matrix A.
    636      * @param Diag Specifies whether or not A is unit triangular.
    637      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
    638      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
    639      * @param incX The increment for the elements of vector x, must be larger than zero.
    640      */
    641     public void CTRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
    642         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
    643         int N = A.getType().getY();
    644         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
    645     }
    646 
    647     /**
    648      * ZTRMV performs one of the matrix-vector operations
    649      * x := A*x   or   x := A**T*x   or   x := A**H*x
    650      *
    651      * Details: http://www.netlib.org/lapack/explore-html/d0/dd1/ztrmv_8f.html
    652      *
    653      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    654      * @param TransA The type of transpose applied to matrix A.
    655      * @param Diag Specifies whether or not A is unit triangular.
    656      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
    657      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
    658      * @param incX The increment for the elements of vector x, must be larger than zero.
    659      */
    660     public void ZTRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
    661         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
    662         int N = A.getType().getY();
    663         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
    664     }
    665 
    666     /**
    667      * STBMV performs one of the matrix-vector operations
    668      * x := A*x   or   x := A**T*x
    669      *
    670      * Details: http://www.netlib.org/lapack/explore-html/d6/d7d/stbmv_8f.html
    671      *
    672      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
    673      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
    674      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
    675      *           for i in range(0, n):
    676      *              for j in range(i, min(i+k+1, n)):
    677      *                  b[i, j-i] = a[i, j]
    678      *
    679      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    680      * @param TransA The type of transpose applied to matrix A.
    681      * @param Diag Specifies whether or not A is unit triangular.
    682      * @param K The number of off-diagonals of the matrix A
    683      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
    684      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
    685      * @param incX The increment for the elements of vector x, must be larger than zero.
    686      */
    687     public void STBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
    688         // TBMV has the same requirements as TRMV + K >= 0
    689         if (K < 0) {
    690             throw new RSRuntimeException("K must be greater than or equal to 0");
    691         }
    692         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
    693         int N = A.getType().getY();
    694         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
    695     }
    696 
    697     /**
    698      * DTBMV performs one of the matrix-vector operations
    699      * x := A*x   or   x := A**T*x
    700      *
    701      * Details: http://www.netlib.org/lapack/explore-html/df/d29/dtbmv_8f.html
    702      *
    703      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
    704      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
    705      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
    706      *           for i in range(0, n):
    707      *              for j in range(i, min(i+k+1, n)):
    708      *                  b[i, j-i] = a[i, j]
    709      *
    710      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    711      * @param TransA The type of transpose applied to matrix A.
    712      * @param Diag Specifies whether or not A is unit triangular.
    713      * @param K The number of off-diagonals of the matrix A
    714      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
    715      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
    716      * @param incX The increment for the elements of vector x, must be larger than zero.
    717      */
    718     public void DTBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
    719         // TBMV has the same requirements as TRMV + K >= 0
    720         if (K < 0) {
    721             throw new RSRuntimeException("K must be greater than or equal to 0");
    722         }
    723         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
    724         int N = A.getType().getY();
    725         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
    726     }
    727 
    728     /**
    729      * CTBMV performs one of the matrix-vector operations
    730      * x := A*x   or   x := A**T*x   or   x := A**H*x
    731      *
    732      * Details: http://www.netlib.org/lapack/explore-html/d3/dcd/ctbmv_8f.html
    733      *
    734      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
    735      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
    736      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
    737      *           for i in range(0, n):
    738      *              for j in range(i, min(i+k+1, n)):
    739      *                  b[i, j-i] = a[i, j]
    740      *
    741      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    742      * @param TransA The type of transpose applied to matrix A.
    743      * @param Diag Specifies whether or not A is unit triangular.
    744      * @param K The number of off-diagonals of the matrix A
    745      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
    746      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
    747      * @param incX The increment for the elements of vector x, must be larger than zero.
    748      */
    749     public void CTBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
    750         // TBMV has the same requirements as TRMV + K >= 0
    751         if (K < 0) {
    752             throw new RSRuntimeException("K must be greater than or equal to 0");
    753         }
    754         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
    755         int N = A.getType().getY();
    756         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
    757     }
    758 
    759     /**
    760      * ZTBMV performs one of the matrix-vector operations
    761      * x := A*x   or   x := A**T*x   or   x := A**H*x
    762      *
    763      * Details: http://www.netlib.org/lapack/explore-html/d3/d39/ztbmv_8f.html
    764      *
    765      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
    766      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
    767      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
    768      *           for i in range(0, n):
    769      *              for j in range(i, min(i+k+1, n)):
    770      *                  b[i, j-i] = a[i, j]
    771      *
    772      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    773      * @param TransA The type of transpose applied to matrix A.
    774      * @param Diag Specifies whether or not A is unit triangular.
    775      * @param K The number of off-diagonals of the matrix A
    776      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
    777      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
    778      * @param incX The increment for the elements of vector x, must be larger than zero.
    779      */
    780     public void ZTBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
    781         // TBMV has the same requirements as TRMV + K >= 0
    782         if (K < 0) {
    783             throw new RSRuntimeException("K must be greater than or equal to 0");
    784         }
    785         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
    786         int N = A.getType().getY();
    787         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
    788     }
    789 
    790     /**
    791      * STPMV performs one of the matrix-vector operations
    792      * x := A*x   or   x := A**T*x
    793      *
    794      * Details: http://www.netlib.org/lapack/explore-html/db/db1/stpmv_8f.html
    795      *
    796      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
    797      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
    798      *       'a' to packed matrix 'b'.
    799      *           k = 0
    800      *           for i in range(0, n):
    801      *              for j in range(i, n):
    802      *                  b[k++] = a[i, j]
    803      *
    804      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    805      * @param TransA The type of transpose applied to matrix A.
    806      * @param Diag Specifies whether or not A is unit triangular.
    807      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32}.
    808      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
    809      * @param incX The increment for the elements of vector x, must be larger than zero.
    810      */
    811     public void STPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
    812         int N = validateTPMV(Element.F32(mRS), Uplo, TransA, Diag, Ap, X, incX);
    813         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
    814     }
    815 
    816     /**
    817      * DTPMV performs one of the matrix-vector operations
    818      * x := A*x   or   x := A**T*x
    819      *
    820      * Details: http://www.netlib.org/lapack/explore-html/dc/dcd/dtpmv_8f.html
    821      *
    822      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
    823      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
    824      *       'a' to packed matrix 'b'.
    825      *           k = 0
    826      *           for i in range(0, n):
    827      *              for j in range(i, n):
    828      *                  b[k++] = a[i, j]
    829      *
    830      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    831      * @param TransA The type of transpose applied to matrix A.
    832      * @param Diag Specifies whether or not A is unit triangular.
    833      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64}.
    834      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
    835      * @param incX The increment for the elements of vector x, must be larger than zero.
    836      */
    837     public void DTPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
    838         int N = validateTPMV(Element.F64(mRS), Uplo, TransA, Diag, Ap, X, incX);
    839         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
    840     }
    841 
    842     /**
    843      * CTPMV performs one of the matrix-vector operations
    844      * x := A*x   or   x := A**T*x   or   x := A**H*x
    845      *
    846      * Details: http://www.netlib.org/lapack/explore-html/d4/dbb/ctpmv_8f.html
    847      *
    848      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
    849      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
    850      *       'a' to packed matrix 'b'.
    851      *           k = 0
    852      *           for i in range(0, n):
    853      *              for j in range(i, n):
    854      *                  b[k++] = a[i, j]
    855      *
    856      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    857      * @param TransA The type of transpose applied to matrix A.
    858      * @param Diag Specifies whether or not A is unit triangular.
    859      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32_2}.
    860      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
    861      * @param incX The increment for the elements of vector x, must be larger than zero.
    862      */
    863     public void CTPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
    864         int N = validateTPMV(Element.F32_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
    865         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
    866     }
    867 
    868     /**
    869      * ZTPMV performs one of the matrix-vector operations
    870      * x := A*x   or   x := A**T*x   or   x := A**H*x
    871      *
    872      * Details: http://www.netlib.org/lapack/explore-html/d2/d9e/ztpmv_8f.html
    873      *
    874      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
    875      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
    876      *       'a' to packed matrix 'b'.
    877      *           k = 0
    878      *           for i in range(0, n):
    879      *              for j in range(i, n):
    880      *                  b[k++] = a[i, j]
    881      *
    882      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    883      * @param TransA The type of transpose applied to matrix A.
    884      * @param Diag Specifies whether or not A is unit triangular.
    885      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64_2}.
    886      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
    887      * @param incX The increment for the elements of vector x, must be larger than zero.
    888      */
    889     public void ZTPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
    890         int N = validateTPMV(Element.F64_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
    891         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
    892     }
    893 
    894     /**
    895      * STRSV solves one of the systems of equations
    896      * A*x = b   or   A**T*x = b
    897      *
    898      * Details: http://www.netlib.org/lapack/explore-html/d0/d2a/strsv_8f.html
    899      *
    900      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    901      * @param TransA The type of transpose applied to matrix A.
    902      * @param Diag Specifies whether or not A is unit triangular.
    903      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
    904      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
    905      * @param incX The increment for the elements of vector x, must be larger than zero.
    906      */
    907     public void STRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
    908         // TRSV is the same as TRMV
    909         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
    910         int N = A.getType().getY();
    911         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
    912 
    913     }
    914 
    915     /**
    916      * DTRSV solves one of the systems of equations
    917      * A*x = b   or   A**T*x = b
    918      *
    919      * Details: http://www.netlib.org/lapack/explore-html/d6/d96/dtrsv_8f.html
    920      *
    921      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    922      * @param TransA The type of transpose applied to matrix A.
    923      * @param Diag Specifies whether or not A is unit triangular.
    924      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
    925      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
    926      * @param incX The increment for the elements of vector x, must be larger than zero.
    927      */
    928     public void DTRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
    929         // TRSV is the same as TRMV
    930         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
    931         int N = A.getType().getY();
    932         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
    933 
    934     }
    935 
    936     /**
    937      * CTRSV solves one of the systems of equations
    938      * A*x = b   or   A**T*x = b   or   A**H*x = b
    939      *
    940      * Details: http://www.netlib.org/lapack/explore-html/d4/dc8/ctrsv_8f.html
    941      *
    942      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    943      * @param TransA The type of transpose applied to matrix A.
    944      * @param Diag Specifies whether or not A is unit triangular.
    945      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
    946      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
    947      * @param incX The increment for the elements of vector x, must be larger than zero.
    948      */
    949     public void CTRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
    950         // TRSV is the same as TRMV
    951         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
    952         int N = A.getType().getY();
    953         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
    954 
    955     }
    956 
    957     /**
    958      * ZTRSV solves one of the systems of equations
    959      * A*x = b   or   A**T*x = b   or   A**H*x = b
    960      *
    961      * Details: http://www.netlib.org/lapack/explore-html/d1/d2f/ztrsv_8f.html
    962      *
    963      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    964      * @param TransA The type of transpose applied to matrix A.
    965      * @param Diag Specifies whether or not A is unit triangular.
    966      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
    967      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
    968      * @param incX The increment for the elements of vector x, must be larger than zero.
    969      */
    970     public void ZTRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
    971         // TRSV is the same as TRMV
    972         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
    973         int N = A.getType().getY();
    974         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
    975 
    976     }
    977 
    978     /**
    979      * STBSV solves one of the systems of equations
    980      * A*x = b   or   A**T*x = b
    981      *
    982      * Details: http://www.netlib.org/lapack/explore-html/d0/d1f/stbsv_8f.html
    983      *
    984      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
    985      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
    986      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
    987      *           for i in range(0, n):
    988      *              for j in range(i, min(i+k+1, n)):
    989      *                  b[i, j-i] = a[i, j]
    990      *
    991      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
    992      * @param TransA The type of transpose applied to matrix A.
    993      * @param Diag Specifies whether or not A is unit triangular.
    994      * @param K The number of off-diagonals of the matrix A
    995      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
    996      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
    997      * @param incX The increment for the elements of vector x, must be larger than zero.
    998      */
    999     public void STBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
   1000         // TBSV is the same as TRMV + K >= 0
   1001         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
   1002         int N = A.getType().getY();
   1003         if (K < 0) {
   1004             throw new RSRuntimeException("Number of diagonals must be positive");
   1005         }
   1006         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
   1007     }
   1008 
   1009     /**
   1010      * DTBSV solves one of the systems of equations
   1011      * A*x = b   or   A**T*x = b
   1012      *
   1013      * Details: http://www.netlib.org/lapack/explore-html/d4/dcf/dtbsv_8f.html
   1014      *
   1015      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
   1016      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
   1017      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
   1018      *           for i in range(0, n):
   1019      *              for j in range(i, min(i+k+1, n)):
   1020      *                  b[i, j-i] = a[i, j]
   1021      *
   1022      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
   1023      * @param TransA The type of transpose applied to matrix A.
   1024      * @param Diag Specifies whether or not A is unit triangular.
   1025      * @param K The number of off-diagonals of the matrix A
   1026      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   1027      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1028      * @param incX The increment for the elements of vector x, must be larger than zero.
   1029      */
   1030     public void DTBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
   1031         // TBSV is the same as TRMV + K >= 0
   1032         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
   1033         int N = A.getType().getY();
   1034         if (K < 0) {
   1035             throw new RSRuntimeException("Number of diagonals must be positive");
   1036         }
   1037         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, A.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
   1038     }
   1039 
   1040     /**
   1041      * CTBSV solves one of the systems of equations
   1042      * A*x = b   or   A**T*x = b   or   A**H*x = b
   1043      *
   1044      * Details: http://www.netlib.org/lapack/explore-html/d9/d5f/ctbsv_8f.html
   1045      *
   1046      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
   1047      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
   1048      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
   1049      *           for i in range(0, n):
   1050      *              for j in range(i, min(i+k+1, n)):
   1051      *                  b[i, j-i] = a[i, j]
   1052      *
   1053      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
   1054      * @param TransA The type of transpose applied to matrix A.
   1055      * @param Diag Specifies whether or not A is unit triangular.
   1056      * @param K The number of off-diagonals of the matrix A
   1057      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   1058      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   1059      * @param incX The increment for the elements of vector x, must be larger than zero.
   1060      */
   1061     public void CTBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
   1062         // TBSV is the same as TRMV + K >= 0
   1063         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
   1064         int N = A.getType().getY();
   1065         if (K < 0) {
   1066             throw new RSRuntimeException("Number of diagonals must be positive");
   1067         }
   1068         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
   1069     }
   1070 
   1071     /**
   1072      * ZTBSV solves one of the systems of equations
   1073      * A*x = b   or   A**T*x = b   or   A**H*x = b
   1074      *
   1075      * Details: http://www.netlib.org/lapack/explore-html/d4/d5a/ztbsv_8f.html
   1076      *
   1077      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
   1078      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
   1079      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
   1080      *           for i in range(0, n):
   1081      *              for j in range(i, min(i+k+1, n)):
   1082      *                  b[i, j-i] = a[i, j]
   1083      *
   1084      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
   1085      * @param TransA The type of transpose applied to matrix A.
   1086      * @param Diag Specifies whether or not A is unit triangular.
   1087      * @param K The number of off-diagonals of the matrix A
   1088      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   1089      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   1090      * @param incX The increment for the elements of vector x, must be larger than zero.
   1091      */
   1092     public void ZTBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
   1093         // TBSV is the same as TRMV + K >= 0
   1094         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
   1095         int N = A.getType().getY();
   1096         if (K < 0) {
   1097             throw new RSRuntimeException("Number of diagonals must be positive");
   1098         }
   1099         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, A.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
   1100     }
   1101 
   1102     /**
   1103      * STPSV solves one of the systems of equations
   1104      * A*x = b   or   A**T*x = b
   1105      *
   1106      * Details: http://www.netlib.org/lapack/explore-html/d0/d7c/stpsv_8f.html
   1107      *
   1108      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1109      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1110      *       'a' to packed matrix 'b'.
   1111      *           k = 0
   1112      *           for i in range(0, n):
   1113      *              for j in range(i, n):
   1114      *                  b[k++] = a[i, j]
   1115      *
   1116      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
   1117      * @param TransA The type of transpose applied to matrix A.
   1118      * @param Diag Specifies whether or not A is unit triangular.
   1119      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32}.
   1120      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
   1121      * @param incX The increment for the elements of vector x, must be larger than zero.
   1122      */
   1123     public void STPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
   1124         // TPSV is same as TPMV
   1125         int N = validateTPMV(Element.F32(mRS), Uplo, TransA, Diag, Ap, X, incX);
   1126         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
   1127     }
   1128 
   1129     /**
   1130      * DTPSV solves one of the systems of equations
   1131      * A*x = b   or   A**T*x = b
   1132      *
   1133      * Details: http://www.netlib.org/lapack/explore-html/d9/d84/dtpsv_8f.html
   1134      *
   1135      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1136      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1137      *       'a' to packed matrix 'b'.
   1138      *           k = 0
   1139      *           for i in range(0, n):
   1140      *              for j in range(i, n):
   1141      *                  b[k++] = a[i, j]
   1142      *
   1143      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
   1144      * @param TransA The type of transpose applied to matrix A.
   1145      * @param Diag Specifies whether or not A is unit triangular.
   1146      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64}.
   1147      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1148      * @param incX The increment for the elements of vector x, must be larger than zero.
   1149      */
   1150     public void DTPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
   1151         // TPSV is same as TPMV
   1152         int N = validateTPMV(Element.F64(mRS), Uplo, TransA, Diag, Ap, X, incX);
   1153         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, incX, 0, 0, 0);
   1154     }
   1155 
   1156     /**
   1157      * CTPSV solves one of the systems of equations
   1158      * A*x = b   or   A**T*x = b   or   A**H*x = b
   1159      *
   1160      * Details: http://www.netlib.org/lapack/explore-html/d8/d56/ctpsv_8f.html
   1161      *
   1162      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1163      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1164      *       'a' to packed matrix 'b'.
   1165      *           k = 0
   1166      *           for i in range(0, n):
   1167      *              for j in range(i, n):
   1168      *                  b[k++] = a[i, j]
   1169      *
   1170      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
   1171      * @param TransA The type of transpose applied to matrix A.
   1172      * @param Diag Specifies whether or not A is unit triangular.
   1173      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32_2}.
   1174      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   1175      * @param incX The increment for the elements of vector x, must be larger than zero.
   1176      */
   1177     public void CTPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
   1178         // TPSV is same as TPMV
   1179         int N = validateTPMV(Element.F32_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
   1180         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
   1181     }
   1182 
   1183     /**
   1184      * ZTPSV solves one of the systems of equations
   1185      * A*x = b   or   A**T*x = b   or   A**H*x = b
   1186      *
   1187      * Details: http://www.netlib.org/lapack/explore-html/da/d57/ztpsv_8f.html
   1188      *
   1189      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1190      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1191      *       'a' to packed matrix 'b'.
   1192      *           k = 0
   1193      *           for i in range(0, n):
   1194      *              for j in range(i, n):
   1195      *                  b[k++] = a[i, j]
   1196      *
   1197      * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
   1198      * @param TransA The type of transpose applied to matrix A.
   1199      * @param Diag Specifies whether or not A is unit triangular.
   1200      * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64_2}.
   1201      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   1202      * @param incX The increment for the elements of vector x, must be larger than zero.
   1203      */
   1204     public void ZTPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
   1205         // TPSV is same as TPMV
   1206         int N = validateTPMV(Element.F64_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
   1207         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, Ap.getID(mRS), X.getID(mRS), 0, 0, 0, incX, 0, 0, 0);
   1208     }
   1209 
   1210     /**
   1211      * Level 2, S and D only
   1212      */
   1213     static int validateSYMV(Element e, @Uplo int Uplo, Allocation A, Allocation X, Allocation Y, int incX, int incY) {
   1214         validateUplo(Uplo);
   1215         int N = A.getType().getY();
   1216         if (A.getType().getX() != N) {
   1217             throw new RSRuntimeException("A must be a square matrix for SYMV");
   1218         }
   1219         if (!A.getType().getElement().isCompatible(e) ||
   1220             !X.getType().getElement().isCompatible(e) ||
   1221             !Y.getType().getElement().isCompatible(e) ) {
   1222             throw new RSRuntimeException("Called BLAS with wrong Element type");
   1223         }
   1224         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
   1225             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
   1226         }
   1227 
   1228         if (incX <= 0 || incY <= 0) {
   1229             throw new RSRuntimeException("Vector increments must be greater than 0");
   1230         }
   1231         int expectedXDim = 1 + (N - 1) * incX;
   1232         if (X.getType().getX() != expectedXDim) {
   1233             throw new RSRuntimeException("Incorrect vector dimensions for SYMV");
   1234         }
   1235         int expectedYDim = 1 + (N - 1) * incY;
   1236         if (Y.getType().getX() != expectedYDim) {
   1237             throw new RSRuntimeException("Incorrect vector dimensions for SYMV");
   1238         }
   1239         return N;
   1240     }
   1241     static int validateSPMV(Element e, @Uplo int Uplo, Allocation Ap, Allocation X, int incX, Allocation Y, int incY) {
   1242         validateUplo(Uplo);
   1243         if (!Ap.getType().getElement().isCompatible(e) ||
   1244             !X.getType().getElement().isCompatible(e) ||
   1245             !Y.getType().getElement().isCompatible(e)) {
   1246             throw new RSRuntimeException("Called BLAS with wrong Element type");
   1247         }
   1248         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
   1249             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
   1250         }
   1251 
   1252         if (Ap.getType().getY() > 1) {
   1253             throw new RSRuntimeException("Ap must have a Y dimension of 0 or 1");
   1254         }
   1255 
   1256         int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
   1257         if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
   1258             throw new RSRuntimeException("Invalid dimension for Ap");
   1259         }
   1260         if (incX <= 0 || incY <= 0) {
   1261             throw new RSRuntimeException("Vector increments must be greater than 0");
   1262         }
   1263         int expectedXDim = 1 + (N - 1) * incX;
   1264         if (X.getType().getX() != expectedXDim) {
   1265             throw new RSRuntimeException("Incorrect vector dimensions for SPMV");
   1266         }
   1267         int expectedYDim = 1 + (N - 1) * incY;
   1268         if (Y.getType().getX() != expectedYDim) {
   1269             throw new RSRuntimeException("Incorrect vector dimensions for SPMV");
   1270         }
   1271 
   1272         return N;
   1273     }
   1274     static void validateGER(Element e, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   1275         if (!A.getType().getElement().isCompatible(e) ||
   1276             !X.getType().getElement().isCompatible(e) ||
   1277             !Y.getType().getElement().isCompatible(e) ) {
   1278             throw new RSRuntimeException("Called BLAS with wrong Element type");
   1279         }
   1280 
   1281         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
   1282             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
   1283         }
   1284 
   1285         int M = A.getType().getY();
   1286         int N = A.getType().getX();
   1287 
   1288         if (N < 1 || M < 1) {
   1289             throw new RSRuntimeException("M and N must be 1 or greater for GER");
   1290         }
   1291         if (incX <= 0 || incY <= 0) {
   1292             throw new RSRuntimeException("Vector increments must be greater than 0");
   1293         }
   1294         int expectedXDim = 1 + (M - 1) * incX;
   1295         if (X.getType().getX() != expectedXDim) {
   1296             throw new RSRuntimeException("Incorrect vector dimensions for GER");
   1297         }
   1298         int expectedYDim = 1 + (N - 1) * incY;
   1299         if (Y.getType().getX() != expectedYDim) {
   1300             throw new RSRuntimeException("Incorrect vector dimensions for GER");
   1301         }
   1302 
   1303 
   1304     }
   1305     static int validateSYR(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation A) {
   1306         validateUplo(Uplo);
   1307         if (!A.getType().getElement().isCompatible(e) ||
   1308             !X.getType().getElement().isCompatible(e)) {
   1309             throw new RSRuntimeException("Called BLAS with wrong Element type");
   1310         }
   1311 
   1312         int N = A.getType().getX();
   1313 
   1314         if (X.getType().getY() > 1) {
   1315             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
   1316         }
   1317         if (N != A.getType().getY()) {
   1318             throw new RSRuntimeException("A must be a symmetric matrix");
   1319         }
   1320         if (incX <= 0) {
   1321             throw new RSRuntimeException("Vector increments must be greater than 0");
   1322         }
   1323         int expectedXDim = 1 + (N - 1) * incX;
   1324         if (X.getType().getX() != expectedXDim) {
   1325             throw new RSRuntimeException("Incorrect vector dimensions for SYR");
   1326         }
   1327         return N;
   1328     }
   1329     static int validateSPR(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation Ap) {
   1330         validateUplo(Uplo);
   1331         if (!Ap.getType().getElement().isCompatible(e) ||
   1332             !X.getType().getElement().isCompatible(e)) {
   1333             throw new RSRuntimeException("Called BLAS with wrong Element type");
   1334         }
   1335         if (X.getType().getY() > 1) {
   1336             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
   1337         }
   1338 
   1339         if (Ap.getType().getY() > 1) {
   1340             throw new RSRuntimeException("Ap must have a Y dimension of 0 or 1");
   1341         }
   1342 
   1343         int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
   1344         if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
   1345             throw new RSRuntimeException("Invalid dimension for Ap");
   1346         }
   1347         if (incX <= 0) {
   1348             throw new RSRuntimeException("Vector increments must be greater than 0");
   1349         }
   1350         int expectedXDim = 1 + (N - 1) * incX;
   1351         if (X.getType().getX() != expectedXDim) {
   1352             throw new RSRuntimeException("Incorrect vector dimensions for SPR");
   1353         }
   1354 
   1355         return N;
   1356     }
   1357 
   1358     static int validateSYR2(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   1359         validateUplo(Uplo);
   1360         if (!A.getType().getElement().isCompatible(e) ||
   1361             !X.getType().getElement().isCompatible(e) ||
   1362             !Y.getType().getElement().isCompatible(e)) {
   1363             throw new RSRuntimeException("Called BLAS with wrong Element type");
   1364         }
   1365 
   1366         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
   1367             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
   1368         }
   1369 
   1370         int N = A.getType().getX();
   1371 
   1372         if (N != A.getType().getY()) {
   1373             throw new RSRuntimeException("A must be a symmetric matrix");
   1374         }
   1375         if (incX <= 0 || incY <= 0) {
   1376             throw new RSRuntimeException("Vector increments must be greater than 0");
   1377         }
   1378         int expectedXDim = 1 + (N - 1) * incX;
   1379         int expectedYDim = 1 + (N - 1) * incY;
   1380         if (X.getType().getX() != expectedXDim || Y.getType().getX() != expectedYDim) {
   1381             throw new RSRuntimeException("Incorrect vector dimensions for SYR");
   1382         }
   1383         return N;
   1384 
   1385     }
   1386     static int validateSPR2(Element e, @Uplo int Uplo, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
   1387         validateUplo(Uplo);
   1388         if (!Ap.getType().getElement().isCompatible(e) ||
   1389             !X.getType().getElement().isCompatible(e) ||
   1390             !Y.getType().getElement().isCompatible(e)) {
   1391             throw new RSRuntimeException("Called BLAS with wrong Element type");
   1392         }
   1393         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
   1394             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
   1395         }
   1396 
   1397         if (Ap.getType().getY() > 1) {
   1398             throw new RSRuntimeException("Ap must have a Y dimension of 0 or 1");
   1399         }
   1400 
   1401         int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
   1402         if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
   1403             throw new RSRuntimeException("Invalid dimension for Ap");
   1404         }
   1405         if (incX <= 0 || incY <= 0) {
   1406             throw new RSRuntimeException("Vector increments must be greater than 0");
   1407         }
   1408         int expectedXDim = 1 + (N - 1) * incX;
   1409         int expectedYDim = 1 + (N - 1) * incY;
   1410         if (X.getType().getX() != expectedXDim || Y.getType().getX() != expectedYDim) {
   1411             throw new RSRuntimeException("Incorrect vector dimensions for SPR2");
   1412         }
   1413 
   1414         return N;
   1415     }
   1416 
   1417     /**
   1418      * SSYMV performs the matrix-vector operation
   1419      * y := alpha*A*x + beta*y
   1420      *
   1421      * Details: http://www.netlib.org/lapack/explore-html/d2/d94/ssymv_8f.html
   1422      *
   1423      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   1424      * @param alpha The scalar alpha.
   1425      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   1426      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
   1427      * @param incX The increment for the elements of vector x, must be larger than zero.
   1428      * @param beta The scalar beta.
   1429      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
   1430      * @param incY The increment for the elements of vector y, must be larger than zero.
   1431      */
   1432     public void SSYMV(@Uplo int Uplo, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
   1433         int N = validateSYMV(Element.F32(mRS), Uplo, A, X, Y, incX, incY);
   1434         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssymv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
   1435     }
   1436 
   1437     /**
   1438      * SSBMV performs the matrix-vector operation
   1439      * y := alpha*A*x + beta*y
   1440      *
   1441      * Details: http://www.netlib.org/lapack/explore-html/d3/da1/ssbmv_8f.html
   1442      *
   1443      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
   1444      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
   1445      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
   1446      *           for i in range(0, n):
   1447      *              for j in range(i, min(i+k+1, n)):
   1448      *                  b[i, j-i] = a[i, j]
   1449      *
   1450      * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
   1451      * @param K The number of off-diagonals of the matrix A
   1452      * @param alpha The scalar alpha.
   1453      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   1454      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
   1455      * @param incX The increment for the elements of vector x, must be larger than zero.
   1456      * @param beta The scalar beta.
   1457      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
   1458      * @param incY The increment for the elements of vector y, must be larger than zero.
   1459      */
   1460     public void SSBMV(@Uplo int Uplo, int K, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
   1461         // SBMV is the same as SYMV + K >= 0
   1462         if (K < 0) {
   1463             throw new RSRuntimeException("K must be greater than or equal to 0");
   1464         }
   1465         int N = validateSYMV(Element.F32(mRS), Uplo, A, X, Y, incX, incY);
   1466         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
   1467     }
   1468 
   1469     /**
   1470      * SSPMV performs the matrix-vector operation
   1471      * y := alpha*A*x + beta*y
   1472      *
   1473      * Details: http://www.netlib.org/lapack/explore-html/d8/d68/sspmv_8f.html
   1474      *
   1475      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1476      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1477      *       'a' to packed matrix 'b'.
   1478      *           k = 0
   1479      *           for i in range(0, n):
   1480      *              for j in range(i, n):
   1481      *                  b[k++] = a[i, j]
   1482      *
   1483      * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
   1484      * @param alpha The scalar alpha.
   1485      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32}.
   1486      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
   1487      * @param incX The increment for the elements of vector x, must be larger than zero.
   1488      * @param beta The scalar beta.
   1489      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
   1490      * @param incY The increment for the elements of vector y, must be larger than zero.
   1491      */
   1492     public void SSPMV(@Uplo int Uplo, float alpha, Allocation Ap, Allocation X, int incX, float beta, Allocation Y, int incY) {
   1493         int N = validateSPMV(Element.F32(mRS), Uplo, Ap, X, incX, Y, incY);
   1494         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sspmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, Ap.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
   1495     }
   1496 
   1497     /**
   1498      * SGER performs the rank 1 operation
   1499      * A := alpha*x*y**T + A
   1500      *
   1501      * Details: http://www.netlib.org/lapack/explore-html/db/d5c/sger_8f.html
   1502      *
   1503      * @param alpha The scalar alpha.
   1504      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
   1505      * @param incX The increment for the elements of vector x, must be larger than zero.
   1506      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
   1507      * @param incY The increment for the elements of vector y, must be larger than zero.
   1508      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   1509      */
   1510     public void SGER(float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   1511         int M = A.getType().getY();
   1512         int N = A.getType().getX();
   1513         validateGER(Element.F32(mRS), X, incX, Y, incY, A);
   1514         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sger, 0, 0, 0, 0, 0, M, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0.f, A.getID(mRS), incX, incY, 0, 0);
   1515     }
   1516 
   1517     /**
   1518      * SSYR performs the rank 1 operation
   1519      * A := alpha*x*x**T + A
   1520      *
   1521      * Details: http://www.netlib.org/lapack/explore-html/d6/dac/ssyr_8f.html
   1522      *
   1523      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   1524      * @param alpha The scalar alpha.
   1525      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
   1526      * @param incX The increment for the elements of vector x, must be larger than zero.
   1527      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   1528      */
   1529     public void SSYR(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation A) {
   1530         int N = validateSYR(Element.F32(mRS), Uplo, X, incX, A);
   1531         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), A.getID(mRS), 0.f, 0, incX, 0, 0, 0);
   1532     }
   1533 
   1534     /**
   1535      * SSPR performs the rank 1 operation
   1536      * A := alpha*x*x**T + A
   1537      *
   1538      * Details: http://www.netlib.org/lapack/explore-html/d2/d9b/sspr_8f.html
   1539      *
   1540      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1541      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1542      *       'a' to packed matrix 'b'.
   1543      *           k = 0
   1544      *           for i in range(0, n):
   1545      *              for j in range(i, n):
   1546      *                  b[k++] = a[i, j]
   1547      *
   1548      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
   1549      * @param alpha The scalar alpha.
   1550      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
   1551      * @param incX The increment for the elements of vector x, must be larger than zero.
   1552      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32}.
   1553      */
   1554     public void SSPR(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Ap) {
   1555         int N = validateSPR(Element.F32(mRS), Uplo, X, incX, Ap);
   1556         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sspr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Ap.getID(mRS), 0.f, 0, incX, 0, 0, 0);
   1557     }
   1558 
   1559     /**
   1560      * SSYR2 performs the symmetric rank 2 operation
   1561      * A := alpha*x*y**T + alpha*y*x**T + A
   1562      *
   1563      * Details: http://www.netlib.org/lapack/explore-html/db/d99/ssyr2_8f.html
   1564      *
   1565      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   1566      * @param alpha The scalar alpha.
   1567      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
   1568      * @param incX The increment for the elements of vector x, must be larger than zero.
   1569      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
   1570      * @param incY The increment for the elements of vector y, must be larger than zero.
   1571      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   1572      */
   1573     public void SSYR2(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   1574         int N = validateSYR2(Element.F32(mRS), Uplo, X, incX, Y, incY, A);
   1575         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0, A.getID(mRS), incX, incY, 0, 0);
   1576     }
   1577 
   1578     /**
   1579      * SSPR2 performs the symmetric rank 2 operation
   1580      * A := alpha*x*y**T + alpha*y*x**T + A
   1581      *
   1582      * Details: http://www.netlib.org/lapack/explore-html/db/d3e/sspr2_8f.html
   1583      *
   1584      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1585      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1586      *       'a' to packed matrix 'b'.
   1587      *           k = 0
   1588      *           for i in range(0, n):
   1589      *              for j in range(i, n):
   1590      *                  b[k++] = a[i, j]
   1591      *
   1592      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
   1593      * @param alpha The scalar alpha.
   1594      * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
   1595      * @param incX The increment for the elements of vector x, must be larger than zero.
   1596      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
   1597      * @param incY The increment for the elements of vector y, must be larger than zero.
   1598      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32}.
   1599      */
   1600     public void SSPR2(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
   1601         int N = validateSPR2(Element.F32(mRS), Uplo, X, incX, Y, incY, Ap);
   1602         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sspr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0, Ap.getID(mRS), incX, incY, 0, 0);
   1603     }
   1604 
   1605     /**
   1606      * DSYMV performs the matrix-vector operation
   1607      * y := alpha*A*x + beta*y
   1608      *
   1609      * Details: http://www.netlib.org/lapack/explore-html/d8/dbe/dsymv_8f.html
   1610      *
   1611      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   1612      * @param alpha The scalar alpha.
   1613      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   1614      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1615      * @param incX The increment for the elements of vector x, must be larger than zero.
   1616      * @param beta The scalar beta.
   1617      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
   1618      * @param incY The increment for the elements of vector y, must be larger than zero.
   1619      */
   1620     public void DSYMV(@Uplo int Uplo, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
   1621         int N = validateSYMV(Element.F64(mRS), Uplo, A, X, Y, incX, incY);
   1622         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsymv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
   1623     }
   1624 
   1625     /**
   1626      * DSBMV performs the matrix-vector operation
   1627      * y := alpha*A*x + beta*y
   1628      *
   1629      * Details: http://www.netlib.org/lapack/explore-html/d8/d1e/dsbmv_8f.html
   1630      *
   1631      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
   1632      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
   1633      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
   1634      *           for i in range(0, n):
   1635      *              for j in range(i, min(i+k+1, n)):
   1636      *                  b[i, j-i] = a[i, j]
   1637      *
   1638      * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
   1639      * @param K The number of off-diagonals of the matrix A
   1640      * @param alpha The scalar alpha.
   1641      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   1642      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1643      * @param incX The increment for the elements of vector x, must be larger than zero.
   1644      * @param beta The scalar beta.
   1645      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
   1646      * @param incY The increment for the elements of vector y, must be larger than zero.
   1647      */
   1648     public void DSBMV(@Uplo int Uplo, int K, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
   1649         // SBMV is the same as SYMV + K >= 0
   1650         if (K < 0) {
   1651             throw new RSRuntimeException("K must be greater than or equal to 0");
   1652         }
   1653         int N = validateSYMV(Element.F64(mRS), Uplo, A, X, Y, incX, incY);
   1654         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha, A.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
   1655     }
   1656 
   1657     /**
   1658      * DSPMV performs the matrix-vector operation
   1659      * y := alpha*A*x + beta*y
   1660      *
   1661      * Details: http://www.netlib.org/lapack/explore-html/d4/d85/dspmv_8f.html
   1662      *
   1663      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1664      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1665      *       'a' to packed matrix 'b'.
   1666      *           k = 0
   1667      *           for i in range(0, n):
   1668      *              for j in range(i, n):
   1669      *                  b[k++] = a[i, j]
   1670      *
   1671      * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
   1672      * @param alpha The scalar alpha.
   1673      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64}.
   1674      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1675      * @param incX The increment for the elements of vector x, must be larger than zero.
   1676      * @param beta The scalar beta.
   1677      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
   1678      * @param incY The increment for the elements of vector y, must be larger than zero.
   1679      */
   1680     public void DSPMV(@Uplo int Uplo, double alpha, Allocation Ap, Allocation X, int incX, double beta, Allocation Y, int incY) {
   1681         int N = validateSPMV(Element.F64(mRS), Uplo, Ap, X, incX, Y, incY);
   1682         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dspmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, Ap.getID(mRS), X.getID(mRS), beta, Y.getID(mRS), incX, incY, 0, 0);
   1683     }
   1684 
   1685     /**
   1686      * DGER performs the rank 1 operation
   1687      * A := alpha*x*y**T + A
   1688      *
   1689      * Details: http://www.netlib.org/lapack/explore-html/dc/da8/dger_8f.html
   1690      *
   1691      * @param alpha The scalar alpha.
   1692      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1693      * @param incX The increment for the elements of vector x, must be larger than zero.
   1694      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
   1695      * @param incY The increment for the elements of vector y, must be larger than zero.
   1696      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   1697      */
   1698     public void DGER(double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   1699         int M = A.getType().getY();
   1700         int N = A.getType().getX();
   1701         validateGER(Element.F64(mRS), X, incX, Y, incY, A);
   1702         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dger, 0, 0, 0, 0, 0, M, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0.f, A.getID(mRS), incX, incY, 0, 0);
   1703     }
   1704 
   1705     /**
   1706      * DSYR performs the rank 1 operation
   1707      * A := alpha*x*x**T + A
   1708      *
   1709      * Details: http://www.netlib.org/lapack/explore-html/d3/d60/dsyr_8f.html
   1710      *
   1711      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   1712      * @param alpha The scalar alpha.
   1713      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1714      * @param incX The increment for the elements of vector x, must be larger than zero.
   1715      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   1716      */
   1717     public void DSYR(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation A) {
   1718         int N = validateSYR(Element.F64(mRS), Uplo, X, incX, A);
   1719         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), A.getID(mRS), 0.f, 0, incX, 0, 0, 0);
   1720     }
   1721 
   1722     /**
   1723      * DSPR performs the rank 1 operation
   1724      * A := alpha*x*x**T + A
   1725      *
   1726      * Details: http://www.netlib.org/lapack/explore-html/dd/dba/dspr_8f.html
   1727      *
   1728      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1729      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1730      *       'a' to packed matrix 'b'.
   1731      *           k = 0
   1732      *           for i in range(0, n):
   1733      *              for j in range(i, n):
   1734      *                  b[k++] = a[i, j]
   1735      *
   1736      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
   1737      * @param alpha The scalar alpha.
   1738      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1739      * @param incX The increment for the elements of vector x, must be larger than zero.
   1740      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64}.
   1741      */
   1742     public void DSPR(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Ap) {
   1743         int N = validateSPR(Element.F64(mRS), Uplo, X, incX, Ap);
   1744         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dspr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Ap.getID(mRS), 0.f, 0, incX, 0, 0, 0);
   1745     }
   1746 
   1747     /**
   1748      * DSYR2 performs the symmetric rank 2 operation
   1749      * A := alpha*x*y**T + alpha*y*x**T + A
   1750      *
   1751      * Details: http://www.netlib.org/lapack/explore-html/de/d41/dsyr2_8f.html
   1752      *
   1753      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   1754      * @param alpha The scalar alpha.
   1755      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1756      * @param incX The increment for the elements of vector x, must be larger than zero.
   1757      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
   1758      * @param incY The increment for the elements of vector y, must be larger than zero.
   1759      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   1760      */
   1761     public void DSYR2(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   1762         int N = validateSYR2(Element.F64(mRS), Uplo, X, incX, Y, incY, A);
   1763         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0, A.getID(mRS), incX, incY, 0, 0);
   1764     }
   1765 
   1766     /**
   1767      * DSPR2 performs the symmetric rank 2 operation
   1768      * A := alpha*x*y**T + alpha*y*x**T + A
   1769      *
   1770      * Details: http://www.netlib.org/lapack/explore-html/dd/d9e/dspr2_8f.html
   1771      *
   1772      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1773      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1774      *       'a' to packed matrix 'b'.
   1775      *           k = 0
   1776      *           for i in range(0, n):
   1777      *              for j in range(i, n):
   1778      *                  b[k++] = a[i, j]
   1779      *
   1780      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
   1781      * @param alpha The scalar alpha.
   1782      * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
   1783      * @param incX The increment for the elements of vector x, must be larger than zero.
   1784      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
   1785      * @param incY The increment for the elements of vector y, must be larger than zero.
   1786      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64}.
   1787      */
   1788     public void DSPR2(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
   1789         int N = validateSPR2(Element.F64(mRS), Uplo, X, incX, Y, incY, Ap);
   1790         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dspr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, X.getID(mRS), Y.getID(mRS), 0, Ap.getID(mRS), incX, incY, 0, 0);
   1791     }
   1792 
   1793 
   1794     /**
   1795      * Level 2, C and Z only
   1796      */
   1797 
   1798     static void validateGERU(Element e, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   1799         if (!A.getType().getElement().isCompatible(e) ||
   1800             !X.getType().getElement().isCompatible(e) ||
   1801             !Y.getType().getElement().isCompatible(e)) {
   1802             throw new RSRuntimeException("Called BLAS with wrong Element type");
   1803         }
   1804         if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
   1805             throw new RSRuntimeException("BLAS vectors must have Y dimension of 0 or 1");
   1806         }
   1807 
   1808         int M = A.getType().getY();
   1809         int N = A.getType().getX();
   1810         if (incX <= 0 || incY <= 0) {
   1811             throw new RSRuntimeException("Vector increments must be greater than 0");
   1812         }
   1813         int expectedXDim = 1 + (M - 1) * incX;
   1814         if (X.getType().getX() != expectedXDim) {
   1815             throw new RSRuntimeException("Incorrect vector dimensions for GERU");
   1816         }
   1817         int expectedYDim = 1 + (N - 1) * incY;
   1818         if (Y.getType().getX() != expectedYDim) {
   1819             throw new RSRuntimeException("Incorrect vector dimensions for GERU");
   1820         }
   1821 
   1822     }
   1823 
   1824     /**
   1825      * CHEMV performs the matrix-vector operation
   1826      * y := alpha*A*x + beta*y
   1827      *
   1828      * Details: http://www.netlib.org/lapack/explore-html/d7/d51/chemv_8f.html
   1829      *
   1830      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   1831      * @param alpha The scalar alpha.
   1832      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   1833      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   1834      * @param incX The increment for the elements of vector x, must be larger than zero.
   1835      * @param beta The scalar beta.
   1836      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
   1837      * @param incY The increment for the elements of vector y, must be larger than zero.
   1838      */
   1839     public void CHEMV(@Uplo int Uplo, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
   1840         // HEMV is the same as SYR2 validation-wise
   1841         int N = validateSYR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, A);
   1842         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chemv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
   1843     }
   1844 
   1845     /**
   1846      * CHBMV performs the matrix-vector operation
   1847      * y := alpha*A*x + beta*y
   1848      *
   1849      * Details: http://www.netlib.org/lapack/explore-html/db/dc2/chbmv_8f.html
   1850      *
   1851      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
   1852      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
   1853      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
   1854      *           for i in range(0, n):
   1855      *              for j in range(i, min(i+k+1, n)):
   1856      *                  b[i, j-i] = a[i, j]
   1857      *
   1858      * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
   1859      * @param K The number of off-diagonals of the matrix A
   1860      * @param alpha The scalar alpha.
   1861      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   1862      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   1863      * @param incX The increment for the elements of vector x, must be larger than zero.
   1864      * @param beta The scalar beta.
   1865      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
   1866      * @param incY The increment for the elements of vector y, must be larger than zero.
   1867      */
   1868     public void CHBMV(@Uplo int Uplo, int K, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
   1869         // HBMV is the same as SYR2 validation-wise
   1870         int N = validateSYR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, A);
   1871         if (K < 0) {
   1872             throw new RSRuntimeException("K must be 0 or greater for HBMV");
   1873         }
   1874         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
   1875     }
   1876 
   1877     /**
   1878      * CHPMV performs the matrix-vector operation
   1879      * y := alpha*A*x + beta*y
   1880      *
   1881      * Details: http://www.netlib.org/lapack/explore-html/d2/d06/chpmv_8f.html
   1882      *
   1883      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1884      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1885      *       'a' to packed matrix 'b'.
   1886      *           k = 0
   1887      *           for i in range(0, n):
   1888      *              for j in range(i, n):
   1889      *                  b[k++] = a[i, j]
   1890      *
   1891      * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
   1892      * @param alpha The scalar alpha.
   1893      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   1894      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   1895      * @param incX The increment for the elements of vector x, must be larger than zero.
   1896      * @param beta The scalar beta.
   1897      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
   1898      * @param incY The increment for the elements of vector y, must be larger than zero.
   1899      */
   1900     public void CHPMV(@Uplo int Uplo, Float2 alpha, Allocation Ap, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
   1901         // HPMV is the same as SPR2
   1902         int N = validateSPR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, Ap);
   1903         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chpmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, Ap.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
   1904     }
   1905 
   1906     /**
   1907      * CGERU performs the rank 1 operation
   1908      * A := alpha*x*y**T + A
   1909      *
   1910      * Details: http://www.netlib.org/lapack/explore-html/db/d5f/cgeru_8f.html
   1911      *
   1912      * @param alpha The scalar alpha.
   1913      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   1914      * @param incX The increment for the elements of vector x, must be larger than zero.
   1915      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
   1916      * @param incY The increment for the elements of vector y, must be larger than zero.
   1917      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   1918      */
   1919     public void CGERU(Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   1920         validateGERU(Element.F32_2(mRS), X, incX, Y, incY, A);
   1921         int M = A.getType().getY();
   1922         int N = A.getType().getX();
   1923         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgeru, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
   1924     }
   1925 
   1926     /**
   1927      * CGERC performs the rank 1 operation
   1928      * A := alpha*x*y**H + A
   1929      *
   1930      * Details: http://www.netlib.org/lapack/explore-html/dd/d84/cgerc_8f.html
   1931      *
   1932      * @param alpha The scalar alpha.
   1933      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   1934      * @param incX The increment for the elements of vector x, must be larger than zero.
   1935      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
   1936      * @param incY The increment for the elements of vector y, must be larger than zero.
   1937      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   1938      */
   1939     public void CGERC(Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   1940         // same as GERU
   1941         validateGERU(Element.F32_2(mRS), X, incX, Y, incY, A);
   1942         int M = A.getType().getY();
   1943         int N = A.getType().getX();
   1944         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgerc, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
   1945     }
   1946 
   1947     /**
   1948      * CHER performs the rank 1 operation
   1949      * A := alpha*x*x**H + A
   1950      *
   1951      * Details: http://www.netlib.org/lapack/explore-html/d3/d6d/cher_8f.html
   1952      *
   1953      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   1954      * @param alpha The scalar alpha.
   1955      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   1956      * @param incX The increment for the elements of vector x, must be larger than zero.
   1957      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   1958      */
   1959     public void CHER(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation A) {
   1960         // same as SYR
   1961         int N = validateSYR(Element.F32_2(mRS), Uplo, X, incX, A);
   1962         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cher, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, X.getID(mRS), 0, 0, 0, A.getID(mRS), incX, 0, 0, 0);
   1963     }
   1964 
   1965     /**
   1966      * CHPR performs the rank 1 operation
   1967      * A := alpha*x*x**H + A
   1968      *
   1969      * Details: http://www.netlib.org/lapack/explore-html/db/dcd/chpr_8f.html
   1970      *
   1971      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   1972      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   1973      *       'a' to packed matrix 'b'.
   1974      *           k = 0
   1975      *           for i in range(0, n):
   1976      *              for j in range(i, n):
   1977      *                  b[k++] = a[i, j]
   1978      *
   1979      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
   1980      * @param alpha The scalar alpha.
   1981      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   1982      * @param incX The increment for the elements of vector x, must be larger than zero.
   1983      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   1984      */
   1985     public void CHPR(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Ap) {
   1986         // equivalent to SPR for validation
   1987         int N = validateSPR(Element.F32_2(mRS), Uplo, X, incX, Ap);
   1988         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chpr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, X.getID(mRS), 0, 0, 0, Ap.getID(mRS), incX, 0, 0, 0);
   1989     }
   1990 
   1991     /**
   1992      * CHER2 performs the symmetric rank 2 operation
   1993      * A := alpha*x*y**H + alpha*y*x**H + A
   1994      *
   1995      * Details: http://www.netlib.org/lapack/explore-html/db/d87/cher2_8f.html
   1996      *
   1997      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   1998      * @param alpha The scalar alpha.
   1999      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   2000      * @param incX The increment for the elements of vector x, must be larger than zero.
   2001      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
   2002      * @param incY The increment for the elements of vector y, must be larger than zero.
   2003      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   2004      */
   2005     public void CHER2(@Uplo int Uplo, Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   2006         // same as SYR2
   2007         int N = validateSYR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, A);
   2008         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cher2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
   2009     }
   2010 
   2011     /**
   2012      * CHPR2 performs the symmetric rank 2 operation
   2013      * A := alpha*x*y**H + alpha*y*x**H + A
   2014      *
   2015      * Details: http://www.netlib.org/lapack/explore-html/d6/d44/chpr2_8f.html
   2016      *
   2017      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   2018      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   2019      *       'a' to packed matrix 'b'.
   2020      *           k = 0
   2021      *           for i in range(0, n):
   2022      *              for j in range(i, n):
   2023      *                  b[k++] = a[i, j]
   2024      *
   2025      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
   2026      * @param alpha The scalar alpha.
   2027      * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
   2028      * @param incX The increment for the elements of vector x, must be larger than zero.
   2029      * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
   2030      * @param incY The increment for the elements of vector y, must be larger than zero.
   2031      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   2032      */
   2033     public void CHPR2(@Uplo int Uplo, Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
   2034         // same as SPR2
   2035         int N = validateSPR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, Ap);
   2036         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chpr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, Ap.getID(mRS), incX, incY, 0, 0);
   2037     }
   2038 
   2039     /**
   2040      * ZHEMV performs the matrix-vector operation
   2041      * y := alpha*A*x + beta*y
   2042      *
   2043      * Details: http://www.netlib.org/lapack/explore-html/d0/ddd/zhemv_8f.html
   2044      *
   2045      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   2046      * @param alpha The scalar alpha.
   2047      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2048      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   2049      * @param incX The increment for the elements of vector x, must be larger than zero.
   2050      * @param beta The scalar beta.
   2051      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
   2052      * @param incY The increment for the elements of vector y, must be larger than zero.
   2053      */
   2054     public void ZHEMV(@Uplo int Uplo, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
   2055         // HEMV is the same as SYR2 validation-wise
   2056         int N = validateSYR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, A);
   2057         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhemv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
   2058     }
   2059 
   2060     /**
   2061      * ZHBMV performs the matrix-vector operation
   2062      * y := alpha*A*x + beta*y
   2063      *
   2064      * Details: http://www.netlib.org/lapack/explore-html/d3/d1a/zhbmv_8f.html
   2065      *
   2066      * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
   2067      *       but only the region N*(K+1) will be referenced. The following subroutine can is an
   2068      *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
   2069      *           for i in range(0, n):
   2070      *              for j in range(i, min(i+k+1, n)):
   2071      *                  b[i, j-i] = a[i, j]
   2072      *
   2073      * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
   2074      * @param K The number of off-diagonals of the matrix A
   2075      * @param alpha The scalar alpha.
   2076      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2077      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   2078      * @param incX The increment for the elements of vector x, must be larger than zero.
   2079      * @param beta The scalar beta.
   2080      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
   2081      * @param incY The increment for the elements of vector y, must be larger than zero.
   2082      */
   2083     public void ZHBMV(@Uplo int Uplo, int K, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
   2084         // HBMV is the same as SYR2 validation-wise
   2085         int N = validateSYR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, A);
   2086         if (K < 0) {
   2087             throw new RSRuntimeException("K must be 0 or greater for HBMV");
   2088         }
   2089         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha.x, alpha.y, A.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
   2090     }
   2091 
   2092     /**
   2093      * ZHPMV performs the matrix-vector operation
   2094      * y := alpha*A*x + beta*y
   2095      *
   2096      * Details: http://www.netlib.org/lapack/explore-html/d0/d60/zhpmv_8f.html
   2097      *
   2098      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   2099      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   2100      *       'a' to packed matrix 'b'.
   2101      *           k = 0
   2102      *           for i in range(0, n):
   2103      *              for j in range(i, n):
   2104      *                  b[k++] = a[i, j]
   2105      *
   2106      * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
   2107      * @param alpha The scalar alpha.
   2108      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2109      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   2110      * @param incX The increment for the elements of vector x, must be larger than zero.
   2111      * @param beta The scalar beta.
   2112      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
   2113      * @param incY The increment for the elements of vector y, must be larger than zero.
   2114      */
   2115     public void ZHPMV(@Uplo int Uplo, Double2 alpha, Allocation Ap, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
   2116         // HPMV is the same as SPR2
   2117         int N = validateSPR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, Ap);
   2118         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhpmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, Ap.getID(mRS), X.getID(mRS), beta.x, beta.y, Y.getID(mRS), incX, incY, 0, 0);
   2119     }
   2120 
   2121     /**
   2122      * ZGERU performs the rank 1 operation
   2123      * A := alpha*x*y**T + A
   2124      *
   2125      * Details: http://www.netlib.org/lapack/explore-html/d7/d12/zgeru_8f.html
   2126      *
   2127      * @param alpha The scalar alpha.
   2128      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   2129      * @param incX The increment for the elements of vector x, must be larger than zero.
   2130      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
   2131      * @param incY The increment for the elements of vector y, must be larger than zero.
   2132      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2133      */
   2134     public void ZGERU(Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   2135         validateGERU(Element.F64_2(mRS), X, incX, Y, incY, A);
   2136         int M = A.getType().getY();
   2137         int N = A.getType().getX();
   2138         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgeru, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
   2139     }
   2140 
   2141     /**
   2142      * ZGERC performs the rank 1 operation
   2143      * A := alpha*x*y**H + A
   2144      *
   2145      * Details: http://www.netlib.org/lapack/explore-html/d3/dad/zgerc_8f.html
   2146      *
   2147      * @param alpha The scalar alpha.
   2148      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   2149      * @param incX The increment for the elements of vector x, must be larger than zero.
   2150      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
   2151      * @param incY The increment for the elements of vector y, must be larger than zero.
   2152      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2153      */
   2154     public void ZGERC(Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   2155         // same as GERU
   2156         validateGERU(Element.F64_2(mRS), X, incX, Y, incY, A);
   2157         int M = A.getType().getY();
   2158         int N = A.getType().getX();
   2159         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgerc, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
   2160     }
   2161 
   2162     /**
   2163      * ZHER performs the rank 1 operation
   2164      * A := alpha*x*x**H + A
   2165      *
   2166      * Details: http://www.netlib.org/lapack/explore-html/de/d0e/zher_8f.html
   2167      *
   2168      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   2169      * @param alpha The scalar alpha.
   2170      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   2171      * @param incX The increment for the elements of vector x, must be larger than zero.
   2172      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2173      */
   2174     public void ZHER(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation A) {
   2175         // same as SYR
   2176         int N = validateSYR(Element.F64_2(mRS), Uplo, X, incX, A);
   2177         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zher, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, X.getID(mRS), 0, 0, 0, A.getID(mRS), incX, 0, 0, 0);
   2178     }
   2179 
   2180     /**
   2181      * ZHPR performs the rank 1 operation
   2182      * A := alpha*x*x**H + A
   2183      *
   2184      * Details: http://www.netlib.org/lapack/explore-html/de/de1/zhpr_8f.html
   2185      *
   2186      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   2187      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   2188      *       'a' to packed matrix 'b'.
   2189      *           k = 0
   2190      *           for i in range(0, n):
   2191      *              for j in range(i, n):
   2192      *                  b[k++] = a[i, j]
   2193      *
   2194      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
   2195      * @param alpha The scalar alpha.
   2196      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   2197      * @param incX The increment for the elements of vector x, must be larger than zero.
   2198      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2199      */
   2200     public void ZHPR(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Ap) {
   2201         // equivalent to SPR for validation
   2202         int N = validateSPR(Element.F64_2(mRS), Uplo, X, incX, Ap);
   2203         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhpr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, X.getID(mRS), 0, 0, 0, Ap.getID(mRS), incX, 0, 0, 0);
   2204     }
   2205 
   2206     /**
   2207      * ZHER2 performs the symmetric rank 2 operation
   2208      * A := alpha*x*y**H + alpha*y*x**H + A
   2209      *
   2210      * Details: http://www.netlib.org/lapack/explore-html/da/d8a/zher2_8f.html
   2211      *
   2212      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   2213      * @param alpha The scalar alpha.
   2214      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   2215      * @param incX The increment for the elements of vector x, must be larger than zero.
   2216      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
   2217      * @param incY The increment for the elements of vector y, must be larger than zero.
   2218      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2219      */
   2220     public void ZHER2(@Uplo int Uplo, Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
   2221         // same as SYR2
   2222         int N = validateSYR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, A);
   2223         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zher2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, A.getID(mRS), incX, incY, 0, 0);
   2224     }
   2225 
   2226     /**
   2227      * ZHPR2 performs the symmetric rank 2 operation
   2228      * A := alpha*x*y**H + alpha*y*x**H + A
   2229      *
   2230      * Details: http://www.netlib.org/lapack/explore-html/d5/d52/zhpr2_8f.html
   2231      *
   2232      * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
   2233      *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
   2234      *       'a' to packed matrix 'b'.
   2235      *           k = 0
   2236      *           for i in range(0, n):
   2237      *              for j in range(i, n):
   2238      *                  b[k++] = a[i, j]
   2239      *
   2240      * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
   2241      * @param alpha The scalar alpha.
   2242      * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
   2243      * @param incX The increment for the elements of vector x, must be larger than zero.
   2244      * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
   2245      * @param incY The increment for the elements of vector y, must be larger than zero.
   2246      * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2247      */
   2248     public void ZHPR2(@Uplo int Uplo, Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
   2249         // same as SPR2
   2250         int N = validateSPR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, Ap);
   2251         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhpr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, X.getID(mRS), Y.getID(mRS), 0, 0, Ap.getID(mRS), incX, incY, 0, 0);
   2252     }
   2253 
   2254 
   2255     /**
   2256      * Level 3 BLAS
   2257      */
   2258 
   2259     static void validateL3(Element e, int TransA, int TransB, int Side, Allocation A, Allocation B, Allocation C) {
   2260         int aM = -1, aN = -1, bM = -1, bN = -1, cM = -1, cN = -1;
   2261         if ((A != null && !A.getType().getElement().isCompatible(e)) ||
   2262             (B != null && !B.getType().getElement().isCompatible(e)) ||
   2263             (C != null && !C.getType().getElement().isCompatible(e))) {
   2264             throw new RSRuntimeException("Called BLAS with wrong Element type");
   2265         }
   2266         if (C == null) {
   2267             //since matrix C is used to store the result, it cannot be null.
   2268             throw new RSRuntimeException("Allocation C cannot be null");
   2269         }
   2270         cM = C.getType().getY();
   2271         cN = C.getType().getX();
   2272 
   2273         if (Side == RIGHT) {
   2274             if ((A == null && B != null) || (A != null && B == null)) {
   2275                 throw new RSRuntimeException("Provided Matrix A without Matrix B, or vice versa");
   2276             }
   2277             if (B != null) {
   2278                 bM = A.getType().getY();
   2279                 bN = A.getType().getX();
   2280             }
   2281             if (A != null) {
   2282                 aM = B.getType().getY();
   2283                 aN = B.getType().getX();
   2284             }
   2285         } else {
   2286             if (A != null) {
   2287                 if (TransA == TRANSPOSE || TransA == CONJ_TRANSPOSE) {
   2288                     aN = A.getType().getY();
   2289                     aM = A.getType().getX();
   2290                 } else {
   2291                     aM = A.getType().getY();
   2292                     aN = A.getType().getX();
   2293                 }
   2294             }
   2295             if (B != null) {
   2296                 if (TransB == TRANSPOSE || TransB == CONJ_TRANSPOSE) {
   2297                     bN = B.getType().getY();
   2298                     bM = B.getType().getX();
   2299                 } else {
   2300                     bM = B.getType().getY();
   2301                     bN = B.getType().getX();
   2302                 }
   2303             }
   2304         }
   2305         if (A != null && B != null && C != null) {
   2306             if (aN != bM || aM != cM || bN != cN) {
   2307                 throw new RSRuntimeException("Called BLAS with invalid dimensions");
   2308             }
   2309         } else if (A != null && C != null) {
   2310             // A and C only, for SYRK
   2311             if (cM != cN) {
   2312                 throw new RSRuntimeException("Matrix C is not symmetric");
   2313             }
   2314             if (aM != cM) {
   2315                 throw new RSRuntimeException("Called BLAS with invalid dimensions");
   2316             }
   2317         } else if (A != null && B != null) {
   2318             // A and B only
   2319             if (aN != bM) {
   2320                 throw new RSRuntimeException("Called BLAS with invalid dimensions");
   2321             }
   2322         }
   2323 
   2324     }
   2325 
   2326     /**
   2327      * SGEMM performs one of the matrix-matrix operations
   2328      * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T
   2329      *
   2330      * Details: http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
   2331      *
   2332      * @param TransA The type of transpose applied to matrix A.
   2333      * @param TransB The type of transpose applied to matrix B.
   2334      * @param alpha The scalar alpha.
   2335      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   2336      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
   2337      * @param beta The scalar beta.
   2338      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
   2339      */
   2340     public void SGEMM(@Transpose int TransA, @Transpose int TransB, float alpha, Allocation A,
   2341                       Allocation B, float beta, Allocation C) {
   2342         validateTranspose(TransA);
   2343         validateTranspose(TransB);
   2344         validateL3(Element.F32(mRS), TransA, TransB, 0, A, B, C);
   2345 
   2346         int M = -1, N = -1, K = -1;
   2347         if (TransA != NO_TRANSPOSE) {
   2348             M = A.getType().getX();
   2349             K = A.getType().getY();
   2350         } else {
   2351             M = A.getType().getY();
   2352             K = A.getType().getX();
   2353         }
   2354         if (TransB != NO_TRANSPOSE) {
   2355             N = B.getType().getY();
   2356         } else {
   2357             N = B.getType().getX();
   2358         }
   2359         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha, A.getID(mRS), B.getID(mRS),
   2360                                         beta, C.getID(mRS), 0, 0, 0, 0);
   2361     }
   2362 
   2363     /**
   2364      * DGEMM performs one of the matrix-matrix operations
   2365      * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T
   2366      *
   2367      * Details: http://www.netlib.org/lapack/explore-html/d7/d2b/dgemm_8f.html
   2368      *
   2369      * @param TransA The type of transpose applied to matrix A.
   2370      * @param TransB The type of transpose applied to matrix B.
   2371      * @param alpha The scalar alpha.
   2372      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   2373      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
   2374      * @param beta The scalar beta.
   2375      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
   2376      */
   2377     public void DGEMM(@Transpose int TransA, @Transpose int TransB, double alpha, Allocation A,
   2378                       Allocation B, double beta, Allocation C) {
   2379         validateTranspose(TransA);
   2380         validateTranspose(TransB);
   2381         validateL3(Element.F64(mRS), TransA, TransB, 0, A, B, C);
   2382         int M = -1, N = -1, K = -1;
   2383         if (TransA != NO_TRANSPOSE) {
   2384             M = A.getType().getX();
   2385             K = A.getType().getY();
   2386         } else {
   2387             M = A.getType().getY();
   2388             K = A.getType().getX();
   2389         }
   2390         if (TransB != NO_TRANSPOSE) {
   2391             N = B.getType().getY();
   2392         } else {
   2393             N = B.getType().getX();
   2394         }
   2395         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha, A.getID(mRS), B.getID(mRS),
   2396                                         beta, C.getID(mRS), 0, 0, 0, 0);
   2397     }
   2398 
   2399     /**
   2400      * CGEMM performs one of the matrix-matrix operations
   2401      * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T  or  op(X) = X**H
   2402      *
   2403      * Details: http://www.netlib.org/lapack/explore-html/d6/d5b/cgemm_8f.html
   2404      *
   2405      * @param TransA The type of transpose applied to matrix A.
   2406      * @param TransB The type of transpose applied to matrix B.
   2407      * @param alpha The scalar alpha.
   2408      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   2409      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
   2410      * @param beta The scalar beta.
   2411      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
   2412      */
   2413     public void CGEMM(@Transpose int TransA, @Transpose int TransB, Float2 alpha, Allocation A,
   2414                       Allocation B, Float2 beta, Allocation C) {
   2415         validateTranspose(TransA);
   2416         validateTranspose(TransB);
   2417         validateL3(Element.F32_2(mRS), TransA, TransB, 0, A, B, C);
   2418         int M = -1, N = -1, K = -1;
   2419         if (TransA != NO_TRANSPOSE) {
   2420             M = A.getType().getX();
   2421             K = A.getType().getY();
   2422         } else {
   2423             M = A.getType().getY();
   2424             K = A.getType().getX();
   2425         }
   2426         if (TransB != NO_TRANSPOSE) {
   2427             N = B.getType().getY();
   2428         } else {
   2429             N = B.getType().getX();
   2430         }
   2431         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha.x, alpha.y, A.getID(mRS), B.getID(mRS),
   2432                                          beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
   2433     }
   2434 
   2435     /**
   2436      * ZGEMM performs one of the matrix-matrix operations
   2437      * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T  or  op(X) = X**H
   2438      *
   2439      * Details: http://www.netlib.org/lapack/explore-html/d7/d76/zgemm_8f.html
   2440      *
   2441      * @param TransA The type of transpose applied to matrix A.
   2442      * @param TransB The type of transpose applied to matrix B.
   2443      * @param alpha The scalar alpha.
   2444      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2445      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
   2446      * @param beta The scalar beta.
   2447      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
   2448      */
   2449     public void ZGEMM(@Transpose int TransA, @Transpose int TransB, Double2 alpha, Allocation A,
   2450                       Allocation B, Double2 beta, Allocation C) {
   2451         validateTranspose(TransA);
   2452         validateTranspose(TransB);
   2453         validateL3(Element.F64_2(mRS), TransA, TransB, 0, A, B, C);
   2454         int M = -1, N = -1, K = -1;
   2455         if (TransA != NO_TRANSPOSE) {
   2456             M = A.getType().getX();
   2457             K = A.getType().getY();
   2458         } else {
   2459             M = A.getType().getY();
   2460             K = A.getType().getX();
   2461         }
   2462         if (TransB != NO_TRANSPOSE) {
   2463             N = B.getType().getY();
   2464         } else {
   2465             N = B.getType().getX();
   2466         }
   2467         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha.x, alpha.y, A.getID(mRS), B.getID(mRS),
   2468                                    beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
   2469     }
   2470 
   2471     /**
   2472      * SSYMM performs one of the matrix-matrix operations
   2473      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
   2474      *
   2475      * Details: http://www.netlib.org/lapack/explore-html/d7/d42/ssymm_8f.html
   2476      *
   2477      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2478      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   2479      * @param alpha The scalar alpha.
   2480      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   2481      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
   2482      * @param beta The scalar beta.
   2483      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
   2484      */
   2485     public void SSYMM(@Side int Side, @Uplo int Uplo, float alpha, Allocation A,
   2486                       Allocation B, float beta, Allocation C) {
   2487         validateSide(Side);
   2488         validateUplo(Uplo);
   2489         //For SYMM, Matrix A should be symmetric
   2490         if (A.getType().getX() != A.getType().getY()) {
   2491             throw new RSRuntimeException("Matrix A is not symmetric");
   2492         }
   2493         validateL3(Element.F32(mRS), 0, 0, Side, A, B, C);
   2494         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha, A.getID(mRS), B.getID(mRS),
   2495                                         beta, C.getID(mRS), 0, 0, 0, 0);
   2496     }
   2497 
   2498     /**
   2499      * DSYMM performs one of the matrix-matrix operations
   2500      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
   2501      *
   2502      * Details: http://www.netlib.org/lapack/explore-html/d8/db0/dsymm_8f.html
   2503      *
   2504      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2505      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   2506      * @param alpha The scalar alpha.
   2507      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   2508      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
   2509      * @param beta The scalar beta.
   2510      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
   2511      */
   2512     public void DSYMM(@Side int Side, @Uplo int Uplo, double alpha, Allocation A,
   2513                       Allocation B, double beta, Allocation C) {
   2514         validateSide(Side);
   2515         validateUplo(Uplo);
   2516         if (A.getType().getX() != A.getType().getY()) {
   2517             throw new RSRuntimeException("Matrix A is not symmetric");
   2518         }
   2519         validateL3(Element.F64(mRS), 0, 0, Side, A, B, C);
   2520         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha, A.getID(mRS), B.getID(mRS),
   2521                                         beta, C.getID(mRS), 0, 0, 0, 0);
   2522     }
   2523 
   2524     /**
   2525      * CSYMM performs one of the matrix-matrix operations
   2526      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
   2527      *
   2528      * Details: http://www.netlib.org/lapack/explore-html/db/d59/csymm_8f.html
   2529      *
   2530      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2531      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   2532      * @param alpha The scalar alpha.
   2533      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   2534      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
   2535      * @param beta The scalar beta.
   2536      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
   2537      */
   2538     public void CSYMM(@Side int Side, @Uplo int Uplo, Float2 alpha, Allocation A,
   2539                       Allocation B, Float2 beta, Allocation C) {
   2540         validateSide(Side);
   2541         validateUplo(Uplo);
   2542         if (A.getType().getX() != A.getType().getY()) {
   2543             throw new RSRuntimeException("Matrix A is not symmetric");
   2544         }
   2545         validateL3(Element.F32_2(mRS), 0, 0, Side, A, B, C);
   2546         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_csymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha.x, alpha.y, A.getID(mRS), B.getID(mRS),
   2547                                          beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
   2548     }
   2549 
   2550     /**
   2551      * ZSYMM performs one of the matrix-matrix operations
   2552      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
   2553      *
   2554      * Details: http://www.netlib.org/lapack/explore-html/df/d51/zsymm_8f.html
   2555      *
   2556      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2557      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   2558      * @param alpha The scalar alpha.
   2559      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2560      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
   2561      * @param beta The scalar beta.
   2562      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
   2563      */
   2564     public void ZSYMM(@Side int Side, @Uplo int Uplo, Double2 alpha, Allocation A,
   2565                       Allocation B, Double2 beta, Allocation C) {
   2566         validateSide(Side);
   2567         validateUplo(Uplo);
   2568         if (A.getType().getX() != A.getType().getY()) {
   2569             throw new RSRuntimeException("Matrix A is not symmetric");
   2570         }
   2571         validateL3(Element.F64_2(mRS), 0, 0, Side, A, B, C);
   2572         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zsymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha.x, alpha.y, A.getID(mRS), B.getID(mRS),
   2573                                    beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
   2574     }
   2575 
   2576     /**
   2577      * SSYRK performs one of the symmetric rank k operations
   2578      * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
   2579      *
   2580      * Details: http://www.netlib.org/lapack/explore-html/d0/d40/ssyrk_8f.html
   2581      *
   2582      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   2583      * @param Trans The type of transpose applied to the operation.
   2584      * @param alpha The scalar alpha.
   2585      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   2586      * @param beta The scalar beta.
   2587      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
   2588      */
   2589     public void SSYRK(@Uplo int Uplo, @Transpose int Trans, float alpha, Allocation A, float beta, Allocation C) {
   2590         validateTranspose(Trans);
   2591         validateUplo(Uplo);
   2592         validateL3(Element.F32(mRS), Trans, 0, 0, A, null, C);
   2593         int K = -1;
   2594         if (Trans != NO_TRANSPOSE) {
   2595             K = A.getType().getY();
   2596         } else {
   2597             K = A.getType().getX();
   2598         }
   2599 
   2600         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, A.getID(mRS), 0, beta, C.getID(mRS), 0, 0, 0, 0);
   2601     }
   2602 
   2603     /**
   2604      * DSYRK performs one of the symmetric rank k operations
   2605      * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
   2606      *
   2607      * Details: http://www.netlib.org/lapack/explore-html/dc/d05/dsyrk_8f.html
   2608      *
   2609      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   2610      * @param Trans The type of transpose applied to the operation.
   2611      * @param alpha The scalar alpha.
   2612      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   2613      * @param beta The scalar beta.
   2614      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
   2615      */
   2616     public void DSYRK(@Uplo int Uplo, @Transpose int Trans, double alpha, Allocation A, double beta, Allocation C) {
   2617         validateTranspose(Trans);
   2618         validateUplo(Uplo);
   2619         validateL3(Element.F64(mRS), Trans, 0, 0, A, null, C);
   2620         int K = -1;
   2621         if (Trans != NO_TRANSPOSE) {
   2622             K = A.getType().getY();
   2623         } else {
   2624             K = A.getType().getX();
   2625         }
   2626         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, A.getID(mRS), 0, beta, C.getID(mRS), 0, 0, 0, 0);
   2627     }
   2628 
   2629     /**
   2630      * CSYRK performs one of the symmetric rank k operations
   2631      * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
   2632      *
   2633      * Details: http://www.netlib.org/lapack/explore-html/d3/d6a/csyrk_8f.html
   2634      *
   2635      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   2636      * @param Trans The type of transpose applied to the operation.
   2637      * @param alpha The scalar alpha.
   2638      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   2639      * @param beta The scalar beta.
   2640      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
   2641      */
   2642     public void CSYRK(@Uplo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Float2 beta, Allocation C) {
   2643         validateTranspose(Trans);
   2644         validateUplo(Uplo);
   2645         validateL3(Element.F32_2(mRS), Trans, 0, 0, A, null, C);
   2646         int K = -1;
   2647         if (Trans != NO_TRANSPOSE) {
   2648             K = A.getType().getY();
   2649         } else {
   2650             K = A.getType().getX();
   2651         }
   2652         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_csyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, A.getID(mRS), 0, beta.x, beta.y,
   2653                                          C.getID(mRS), 0, 0, 0, 0);
   2654     }
   2655 
   2656     /**
   2657      * ZSYRK performs one of the symmetric rank k operations
   2658      * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
   2659      *
   2660      * Details: http://www.netlib.org/lapack/explore-html/de/d54/zsyrk_8f.html
   2661      *
   2662      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   2663      * @param Trans The type of transpose applied to the operation.
   2664      * @param alpha The scalar alpha.
   2665      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2666      * @param beta The scalar beta.
   2667      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
   2668      */
   2669     public void ZSYRK(@Uplo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Double2 beta, Allocation C) {
   2670         validateTranspose(Trans);
   2671         validateUplo(Uplo);
   2672         validateL3(Element.F64_2(mRS), Trans, 0, 0, A, null, C);
   2673         int K = -1;
   2674         if (Trans != NO_TRANSPOSE) {
   2675             K = A.getType().getY();
   2676         } else {
   2677             K = A.getType().getX();
   2678         }
   2679         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zsyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, A.getID(mRS), 0, beta.x, beta.y,
   2680                                    C.getID(mRS), 0, 0, 0, 0);
   2681     }
   2682 
   2683     static void validateSYR2K(Element e, @Transpose int Trans, Allocation A, Allocation B, Allocation C) {
   2684         validateTranspose(Trans);
   2685         if (!A.getType().getElement().isCompatible(e) ||
   2686             !B.getType().getElement().isCompatible(e) ||
   2687             !C.getType().getElement().isCompatible(e)) {
   2688             throw new RSRuntimeException("Called BLAS with wrong Element type");
   2689         }
   2690         int Cdim = -1;
   2691         // A is n x k if no transpose, k x n if transpose
   2692         // C is n x n
   2693         if (Trans == TRANSPOSE) {
   2694             // check columns versus C
   2695             Cdim = A.getType().getX();
   2696         } else {
   2697             // check rows versus C
   2698             Cdim = A.getType().getY();
   2699         }
   2700         if (C.getType().getX() != Cdim || C.getType().getY() != Cdim) {
   2701             throw new RSRuntimeException("Invalid symmetric matrix in SYR2K");
   2702         }
   2703         // A dims == B dims
   2704         if (A.getType().getX() != B.getType().getX() || A.getType().getY() != B.getType().getY()) {
   2705             throw new RSRuntimeException("Invalid A and B in SYR2K");
   2706         }
   2707     }
   2708 
   2709     /**
   2710      * SSYR2K performs one of the symmetric rank 2k operations
   2711      * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
   2712      *
   2713      * Details: http://www.netlib.org/lapack/explore-html/df/d3d/ssyr2k_8f.html
   2714      *
   2715      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   2716      * @param Trans The type of transpose applied to the operation.
   2717      * @param alpha The scalar alpha.
   2718      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   2719      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
   2720      * @param beta The scalar beta.
   2721      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
   2722      */
   2723     public void SSYR2K(@Uplo int Uplo, @Transpose int Trans, float alpha, Allocation A, Allocation B, float beta, Allocation C) {
   2724         validateUplo(Uplo);
   2725         validateSYR2K(Element.F32(mRS), Trans, A, B, C);
   2726         int K = -1;
   2727         if (Trans != NO_TRANSPOSE) {
   2728             K = A.getType().getY();
   2729         } else {
   2730             K = A.getType().getX();
   2731         }
   2732         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, A.getID(mRS), B.getID(mRS), beta, C.getID(mRS), 0, 0, 0, 0);
   2733     }
   2734 
   2735     /**
   2736      * DSYR2K performs one of the symmetric rank 2k operations
   2737      * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
   2738      *
   2739      * Details: http://www.netlib.org/lapack/explore-html/d1/dec/dsyr2k_8f.html
   2740      *
   2741      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   2742      * @param Trans The type of transpose applied to the operation.
   2743      * @param alpha The scalar alpha.
   2744      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   2745      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
   2746      * @param beta The scalar beta.
   2747      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
   2748      */
   2749     public void DSYR2K(@Uplo int Uplo, @Transpose int Trans, double alpha, Allocation A, Allocation B, double beta, Allocation C) {
   2750         validateUplo(Uplo);
   2751         validateSYR2K(Element.F64(mRS), Trans, A, B, C);
   2752         int K = -1;
   2753         if (Trans != NO_TRANSPOSE) {
   2754             K = A.getType().getY();
   2755         } else {
   2756             K = A.getType().getX();
   2757         }
   2758         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, A.getID(mRS), B.getID(mRS), beta, C.getID(mRS), 0, 0, 0, 0);
   2759     }
   2760 
   2761     /**
   2762      * CSYR2K performs one of the symmetric rank 2k operations
   2763      * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
   2764      *
   2765      * Details: http://www.netlib.org/lapack/explore-html/de/d7e/csyr2k_8f.html
   2766      *
   2767      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   2768      * @param Trans The type of transpose applied to the operation.
   2769      * @param alpha The scalar alpha.
   2770      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   2771      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
   2772      * @param beta The scalar beta.
   2773      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
   2774      */
   2775     public void CSYR2K(@Uplo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C) {
   2776         validateUplo(Uplo);
   2777         validateSYR2K(Element.F32_2(mRS), Trans, A, B, C);
   2778         int K = -1;
   2779         if (Trans != NO_TRANSPOSE) {
   2780             K = A.getType().getY();
   2781         } else {
   2782             K = A.getType().getX();
   2783         }
   2784         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_csyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
   2785     }
   2786 
   2787     /**
   2788      * ZSYR2K performs one of the symmetric rank 2k operations
   2789      * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
   2790      *
   2791      * Details: http://www.netlib.org/lapack/explore-html/df/d20/zsyr2k_8f.html
   2792      *
   2793      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   2794      * @param Trans The type of transpose applied to the operation.
   2795      * @param alpha The scalar alpha.
   2796      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2797      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
   2798      * @param beta The scalar beta.
   2799      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
   2800      */
   2801     public void ZSYR2K(@Uplo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C) {
   2802         validateUplo(Uplo);
   2803         validateSYR2K(Element.F64_2(mRS), Trans, A, B, C);
   2804         int K = -1;
   2805         if (Trans != NO_TRANSPOSE) {
   2806             K = A.getType().getY();
   2807         } else {
   2808             K = A.getType().getX();
   2809         }
   2810         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zsyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
   2811     }
   2812 
   2813     static void validateTRMM(Element e, @Side int Side, @Transpose int TransA, Allocation A, Allocation B) {
   2814         validateSide(Side);
   2815         validateTranspose(TransA);
   2816         int aM = -1, aN = -1, bM = -1, bN = -1;
   2817         if (!A.getType().getElement().isCompatible(e) ||
   2818             !B.getType().getElement().isCompatible(e)) {
   2819             throw new RSRuntimeException("Called BLAS with wrong Element type");
   2820         }
   2821 
   2822         aM = A.getType().getY();
   2823         aN = A.getType().getX();
   2824         if (aM != aN) {
   2825             throw new RSRuntimeException("Called TRMM with a non-symmetric matrix A");
   2826         }
   2827 
   2828         bM = B.getType().getY();
   2829         bN = B.getType().getX();
   2830         if (Side == LEFT) {
   2831             if (aN != bM) {
   2832                 throw new RSRuntimeException("Called TRMM with invalid matrices");
   2833             }
   2834         } else {
   2835             if (bN != aM) {
   2836                 throw new RSRuntimeException("Called TRMM with invalid matrices");
   2837             }
   2838         }
   2839     }
   2840 
   2841     /**
   2842      * STRMM performs one of the matrix-matrix operations
   2843      * B := alpha*op(A)*B   or   B := alpha*B*op(A)
   2844      * op(A) is one of  op(A) = A  or  op(A) = A**T
   2845      *
   2846      * Details: http://www.netlib.org/lapack/explore-html/df/d01/strmm_8f.html
   2847      *
   2848      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2849      * @param Uplo Specifies whether matrix A is upper or lower triangular.
   2850      * @param TransA The type of transpose applied to matrix A.
   2851      * @param Diag Specifies whether or not A is unit triangular.
   2852      * @param alpha The scalar alpha.
   2853      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   2854      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
   2855      */
   2856     public void STRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, float alpha, Allocation A, Allocation B) {
   2857         validateUplo(Uplo);
   2858         validateDiag(Diag);
   2859         validateTRMM(Element.F32(mRS), Side, TransA, A, B);
   2860         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
   2861                                         alpha, A.getID(mRS), B.getID(mRS), 0.f, 0, 0, 0, 0, 0);
   2862     }
   2863 
   2864     /**
   2865      * DTRMM performs one of the matrix-matrix operations
   2866      * B := alpha*op(A)*B   or   B := alpha*B*op(A)
   2867      * op(A) is one of  op(A) = A  or  op(A) = A**T
   2868      *
   2869      * Details: http://www.netlib.org/lapack/explore-html/dd/d19/dtrmm_8f.html
   2870      *
   2871      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2872      * @param Uplo Specifies whether matrix A is upper or lower triangular.
   2873      * @param TransA The type of transpose applied to matrix A.
   2874      * @param Diag Specifies whether or not A is unit triangular.
   2875      * @param alpha The scalar alpha.
   2876      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   2877      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
   2878      */
   2879     public void DTRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, double alpha, Allocation A, Allocation B) {
   2880         validateUplo(Uplo);
   2881         validateDiag(Diag);
   2882         validateTRMM(Element.F64(mRS), Side, TransA, A, B);
   2883         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
   2884                                         alpha, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0);
   2885     }
   2886 
   2887     /**
   2888      * CTRMM performs one of the matrix-matrix operations
   2889      * B := alpha*op(A)*B   or   B := alpha*B*op(A)
   2890      * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
   2891      *
   2892      * Details: http://www.netlib.org/lapack/explore-html/d4/d9b/ctrmm_8f.html
   2893      *
   2894      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2895      * @param Uplo Specifies whether matrix A is upper or lower triangular.
   2896      * @param TransA The type of transpose applied to matrix A.
   2897      * @param Diag Specifies whether or not A is unit triangular.
   2898      * @param alpha The scalar alpha.
   2899      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   2900      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
   2901      */
   2902     public void CTRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Float2 alpha, Allocation A, Allocation B) {
   2903         validateUplo(Uplo);
   2904         validateDiag(Diag);
   2905         validateTRMM(Element.F32_2(mRS), Side, TransA, A, B);
   2906         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
   2907                                          alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0, 0);
   2908     }
   2909 
   2910     /**
   2911      * ZTRMM performs one of the matrix-matrix operations
   2912      * B := alpha*op(A)*B   or   B := alpha*B*op(A)
   2913      * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
   2914      *
   2915      * Details: http://www.netlib.org/lapack/explore-html/d8/de1/ztrmm_8f.html
   2916      *
   2917      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2918      * @param Uplo Specifies whether matrix A is upper or lower triangular.
   2919      * @param TransA The type of transpose applied to matrix A.
   2920      * @param Diag Specifies whether or not A is unit triangular.
   2921      * @param alpha The scalar alpha.
   2922      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   2923      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
   2924      */
   2925     public void ZTRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Double2 alpha, Allocation A, Allocation B) {
   2926         validateUplo(Uplo);
   2927         validateDiag(Diag);
   2928         validateTRMM(Element.F64_2(mRS), Side, TransA, A, B);
   2929         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
   2930                                    alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0, 0);
   2931     }
   2932 
   2933     static void validateTRSM(Element e, @Side int Side, @Transpose int TransA, Allocation A, Allocation B) {
   2934         int adim = -1, bM = -1, bN = -1;
   2935         validateSide(Side);
   2936         validateTranspose(TransA);
   2937         if (!A.getType().getElement().isCompatible(e) ||
   2938             !B.getType().getElement().isCompatible(e)) {
   2939             throw new RSRuntimeException("Called BLAS with wrong Element type");
   2940         }
   2941         adim = A.getType().getX();
   2942         if (adim != A.getType().getY()) {
   2943             // this may be unnecessary, the restriction could potentially be relaxed
   2944             // A needs to contain at least that symmetric matrix but could theoretically be larger
   2945             // for now we assume adapters are sufficient, will reevaluate in the future
   2946             throw new RSRuntimeException("Called TRSM with a non-symmetric matrix A");
   2947         }
   2948         bM = B.getType().getY();
   2949         bN = B.getType().getX();
   2950         if (Side == LEFT) {
   2951             // A is M*M
   2952             if (adim != bM) {
   2953                 throw new RSRuntimeException("Called TRSM with invalid matrix dimensions");
   2954             }
   2955         } else {
   2956             // A is N*N
   2957             if (adim != bN) {
   2958                 throw new RSRuntimeException("Called TRSM with invalid matrix dimensions");
   2959             }
   2960         }
   2961     }
   2962 
   2963     /**
   2964      * STRSM solves one of the matrix equations
   2965      * op(A)*X := alpha*B   or   X*op(A) := alpha*B
   2966      * op(A) is one of  op(A) = A  or  op(A) = A**T
   2967      *
   2968      * Details: http://www.netlib.org/lapack/explore-html/d2/d8b/strsm_8f.html
   2969      *
   2970      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2971      * @param Uplo Specifies whether matrix A is upper or lower triangular.
   2972      * @param TransA The type of transpose applied to matrix A.
   2973      * @param Diag Specifies whether or not A is unit triangular.
   2974      * @param alpha The scalar alpha.
   2975      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
   2976      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
   2977      */
   2978     public void STRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, float alpha, Allocation A, Allocation B) {
   2979         validateUplo(Uplo);
   2980         validateDiag(Diag);
   2981         validateTRSM(Element.F32(mRS), Side, TransA, A, B);
   2982         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
   2983                                         alpha, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0);
   2984     }
   2985 
   2986     /**
   2987      * DTRSM solves one of the matrix equations
   2988      * op(A)*X := alpha*B   or   X*op(A) := alpha*B
   2989      * op(A) is one of  op(A) = A  or  op(A) = A**T
   2990      *
   2991      * Details: http://www.netlib.org/lapack/explore-html/de/da7/dtrsm_8f.html
   2992      *
   2993      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   2994      * @param Uplo Specifies whether matrix A is upper or lower triangular.
   2995      * @param TransA The type of transpose applied to matrix A.
   2996      * @param Diag Specifies whether or not A is unit triangular.
   2997      * @param alpha The scalar alpha.
   2998      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
   2999      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
   3000      */
   3001     public void DTRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, double alpha, Allocation A, Allocation B) {
   3002         validateUplo(Uplo);
   3003         validateDiag(Diag);
   3004         validateTRSM(Element.F64(mRS), Side, TransA, A, B);
   3005         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
   3006                                         alpha, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0);
   3007     }
   3008 
   3009     /**
   3010      * CTRSM solves one of the matrix equations
   3011      * op(A)*X := alpha*B   or   X*op(A) := alpha*B
   3012      * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
   3013      *
   3014      * Details: http://www.netlib.org/lapack/explore-html/de/d30/ctrsm_8f.html
   3015      *
   3016      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   3017      * @param Uplo Specifies whether matrix A is upper or lower triangular.
   3018      * @param TransA The type of transpose applied to matrix A.
   3019      * @param Diag Specifies whether or not A is unit triangular.
   3020      * @param alpha The scalar alpha.
   3021      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   3022      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
   3023      */
   3024     public void CTRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Float2 alpha, Allocation A, Allocation B) {
   3025         validateUplo(Uplo);
   3026         validateDiag(Diag);
   3027         validateTRSM(Element.F32_2(mRS), Side, TransA, A, B);
   3028         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
   3029                                          alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0, 0);
   3030     }
   3031 
   3032     /**
   3033      * ZTRSM solves one of the matrix equations
   3034      * op(A)*X := alpha*B   or   X*op(A) := alpha*B
   3035      * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
   3036      *
   3037      * Details: http://www.netlib.org/lapack/explore-html/d1/d39/ztrsm_8f.html
   3038      *
   3039      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   3040      * @param Uplo Specifies whether matrix A is upper or lower triangular.
   3041      * @param TransA The type of transpose applied to matrix A.
   3042      * @param Diag Specifies whether or not A is unit triangular.
   3043      * @param alpha The scalar alpha.
   3044      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   3045      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
   3046      */
   3047     public void ZTRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Double2 alpha, Allocation A, Allocation B) {
   3048         validateUplo(Uplo);
   3049         validateDiag(Diag);
   3050         validateTRSM(Element.F64_2(mRS), Side, TransA, A, B);
   3051         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
   3052                                    alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), 0, 0, 0, 0, 0, 0, 0);
   3053     }
   3054 
   3055     static void validateHEMM(Element e, @Side int Side, Allocation A, Allocation B, Allocation C) {
   3056         validateSide(Side);
   3057 
   3058         if (!A.getType().getElement().isCompatible(e) ||
   3059             !B.getType().getElement().isCompatible(e) ||
   3060             !C.getType().getElement().isCompatible(e)) {
   3061             throw new RSRuntimeException("Called BLAS with wrong Element type");
   3062         }
   3063 
   3064         // A must be square; can potentially be relaxed similar to TRSM
   3065         int adim = A.getType().getX();
   3066         if (adim != A.getType().getY()) {
   3067             throw new RSRuntimeException("Called HEMM with non-square A");
   3068         }
   3069         if ((Side == LEFT && adim != B.getType().getY()) ||
   3070             (Side == RIGHT && adim != B.getType().getX())) {
   3071             throw new RSRuntimeException("Called HEMM with invalid B");
   3072         }
   3073         if (B.getType().getX() != C.getType().getX() ||
   3074             B.getType().getY() != C.getType().getY()) {
   3075             throw new RSRuntimeException("Called HEMM with mismatched B and C");
   3076         }
   3077     }
   3078 
   3079     /**
   3080      * CHEMM performs one of the matrix-matrix operations
   3081      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
   3082      *
   3083      * Details: http://www.netlib.org/lapack/explore-html/d3/d66/chemm_8f.html
   3084      *
   3085      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   3086      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   3087      * @param alpha The scalar alpha.
   3088      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   3089      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
   3090      * @param beta The scalar beta.
   3091      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
   3092      */
   3093     public void CHEMM(@Side int Side, @Uplo int Uplo, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C) {
   3094         validateUplo(Uplo);
   3095         validateHEMM(Element.F32_2(mRS), Side, A, B, C);
   3096         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chemm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0,
   3097                                          alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
   3098     }
   3099 
   3100     /**
   3101      * ZHEMM performs one of the matrix-matrix operations
   3102      * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
   3103      *
   3104      * Details: http://www.netlib.org/lapack/explore-html/d6/d3e/zhemm_8f.html
   3105      *
   3106      * @param Side Specifies whether the symmetric matrix A appears on the left or right.
   3107      * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
   3108      * @param alpha The scalar alpha.
   3109      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   3110      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
   3111      * @param beta The scalar beta.
   3112      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
   3113      */
   3114     public void ZHEMM(@Side int Side, @Uplo int Uplo, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C) {
   3115         validateUplo(Uplo);
   3116         validateHEMM(Element.F64_2(mRS), Side, A, B, C);
   3117         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhemm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0,
   3118                                    alpha.x, alpha.y, A.getID(mRS), B.getID(mRS), beta.x, beta.y, C.getID(mRS), 0, 0, 0, 0);
   3119     }
   3120 
   3121     static void validateHERK(Element e, @Transpose int Trans, Allocation A, Allocation C) {
   3122         if (!A.getType().getElement().isCompatible(e) ||
   3123             !C.getType().getElement().isCompatible(e)) {
   3124             throw new RSRuntimeException("Called BLAS with wrong Element type");
   3125         }
   3126         validateConjTranspose(Trans);
   3127         int cdim = C.getType().getX();
   3128         if (cdim != C.getType().getY()) {
   3129             throw new RSRuntimeException("Called HERK with non-square C");
   3130         }
   3131         if (Trans == NO_TRANSPOSE) {
   3132             if (cdim != A.getType().getY()) {
   3133                 throw new RSRuntimeException("Called HERK with invalid A");
   3134             }
   3135         } else {
   3136             if (cdim != A.getType().getX()) {
   3137                 throw new RSRuntimeException("Called HERK with invalid A");
   3138             }
   3139         }
   3140     }
   3141 
   3142     /**
   3143      * CHERK performs one of the hermitian rank k operations
   3144      * C := alpha*A*A**H + beta*C   or   C := alpha*A**H*A + beta*C
   3145      *
   3146      * Details: http://www.netlib.org/lapack/explore-html/d8/d52/cherk_8f.html
   3147      *
   3148      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   3149      * @param Trans The type of transpose applied to the operation.
   3150      * @param alpha The scalar alpha.
   3151      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   3152      * @param beta The scalar beta.
   3153      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
   3154      */
   3155     public void CHERK(@Uplo int Uplo, @Transpose int Trans, float alpha, Allocation A, float beta, Allocation C) {
   3156         validateUplo(Uplo);
   3157         validateHERK(Element.F32_2(mRS), Trans, A, C);
   3158         int k = 0;
   3159         if (Trans == CONJ_TRANSPOSE) {
   3160             k = A.getType().getY();
   3161         } else {
   3162             k = A.getType().getX();
   3163         }
   3164         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cherk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k,
   3165                                          alpha, 0, A.getID(mRS), 0, beta, 0, C.getID(mRS), 0, 0, 0, 0);
   3166     }
   3167 
   3168     /**
   3169      * ZHERK performs one of the hermitian rank k operations
   3170      * C := alpha*A*A**H + beta*C   or   C := alpha*A**H*A + beta*C
   3171      *
   3172      * Details: http://www.netlib.org/lapack/explore-html/d1/db1/zherk_8f.html
   3173      *
   3174      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   3175      * @param Trans The type of transpose applied to the operation.
   3176      * @param alpha The scalar alpha.
   3177      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   3178      * @param beta The scalar beta.
   3179      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
   3180      */
   3181     public void ZHERK(@Uplo int Uplo, @Transpose int Trans, double alpha, Allocation A, double beta, Allocation C) {
   3182         validateUplo(Uplo);
   3183         validateHERK(Element.F64_2(mRS), Trans, A, C);
   3184         int k = 0;
   3185         if (Trans == CONJ_TRANSPOSE) {
   3186             k = A.getType().getY();
   3187         } else {
   3188             k = A.getType().getX();
   3189         }
   3190         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zherk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k,
   3191                                    alpha, 0, A.getID(mRS), 0, beta, 0, C.getID(mRS), 0, 0, 0, 0);
   3192     }
   3193 
   3194     static void validateHER2K(Element e, @Transpose int Trans, Allocation A, Allocation B, Allocation C) {
   3195         if (!A.getType().getElement().isCompatible(e) ||
   3196             !B.getType().getElement().isCompatible(e) ||
   3197             !C.getType().getElement().isCompatible(e)) {
   3198             throw new RSRuntimeException("Called BLAS with wrong Element type");
   3199         }
   3200         validateConjTranspose(Trans);
   3201         int cdim = C.getType().getX();
   3202         if (cdim != C.getType().getY()) {
   3203             throw new RSRuntimeException("Called HER2K with non-square C");
   3204         }
   3205         if (Trans == NO_TRANSPOSE) {
   3206             if (A.getType().getY() != cdim) {
   3207                 throw new RSRuntimeException("Called HER2K with invalid matrices");
   3208             }
   3209         } else {
   3210             if (A.getType().getX() != cdim) {
   3211                 throw new RSRuntimeException("Called HER2K with invalid matrices");
   3212             }
   3213         }
   3214         if (A.getType().getX() != B.getType().getX() || A.getType().getY() != B.getType().getY()) {
   3215             throw new RSRuntimeException("Called HER2K with invalid A and B matrices");
   3216         }
   3217     }
   3218 
   3219     /**
   3220      * CHER2K performs one of the hermitian rank 2k operations
   3221      * C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C   or   C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C
   3222      *
   3223      * Details: http://www.netlib.org/lapack/explore-html/d1/d82/cher2k_8f.html
   3224      *
   3225      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   3226      * @param Trans The type of transpose applied to the operation.
   3227      * @param alpha The scalar alpha.
   3228      * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
   3229      * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
   3230      * @param beta The scalar beta.
   3231      * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
   3232      */
   3233     public void CHER2K(@Uplo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Allocation B, float beta, Allocation C) {
   3234         validateUplo(Uplo);
   3235         validateHER2K(Element.F32_2(mRS), Trans, A, B, C);
   3236         int k = 0;
   3237         if (Trans == NO_TRANSPOSE) {
   3238             k = A.getType().getX();
   3239         } else {
   3240             k = A.getType().getY();
   3241         }
   3242         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cher2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k, alpha.x, alpha.y,
   3243                                          A.getID(mRS), B.getID(mRS), beta, 0, C.getID(mRS), 0, 0, 0, 0);
   3244     }
   3245 
   3246     /**
   3247      * ZHER2K performs one of the hermitian rank 2k operations
   3248      * C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C   or   C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C
   3249      *
   3250      * Details: http://www.netlib.org/lapack/explore-html/d7/dfa/zher2k_8f.html
   3251      *
   3252      * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
   3253      * @param Trans The type of transpose applied to the operation.
   3254      * @param alpha The scalar alpha.
   3255      * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
   3256      * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
   3257      * @param beta The scalar beta.
   3258      * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
   3259      */
   3260     public void ZHER2K(@Uplo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Allocation B, double beta, Allocation C) {
   3261         validateUplo(Uplo);
   3262         validateHER2K(Element.F64_2(mRS), Trans, A, B, C);
   3263         int k = 0;
   3264         if (Trans == NO_TRANSPOSE) {
   3265             k = A.getType().getX();
   3266         } else {
   3267             k = A.getType().getY();
   3268         }
   3269         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zher2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k, alpha.x, alpha.y,
   3270                                    A.getID(mRS), B.getID(mRS), beta, 0, C.getID(mRS), 0, 0, 0, 0);
   3271     }
   3272 
   3273 
   3274     /**
   3275      * 8-bit GEMM-like operation for neural networks: C = A * Transpose(B)
   3276      * Calculations are done in 1.10.21 fixed-point format for the final output,
   3277      * just before there's a shift down to drop the fractional parts. The output
   3278      * values are gated to 0 to 255 to fit in a byte, but the 10-bit format
   3279      * gives some headroom to avoid wrapping around on small overflows.
   3280      *
   3281      * @param A The input allocation contains matrix A, supported elements type {@link Element#U8}.
   3282      * @param a_offset The offset for all values in matrix A, e.g A[i,j] = A[i,j] - a_offset. Value should be from 0 to 255.
   3283      * @param B The input allocation contains matrix B, supported elements type {@link Element#U8}.
   3284      * @param b_offset The offset for all values in matrix B, e.g B[i,j] = B[i,j] - b_offset. Value should be from 0 to 255.
   3285      * @param C The input allocation contains matrix C, supported elements type {@link Element#U8}.
   3286      * @param c_offset The offset for all values in matrix C.
   3287      * @param c_mult The multiplier for all values in matrix C, e.g C[i,j] = (C[i,j] + c_offset) * c_mult.
   3288      **/
   3289     public void BNNM(Allocation A, int a_offset, Allocation B, int b_offset, Allocation C, int c_offset, int c_mult) {
   3290         validateL3(Element.U8(mRS), NO_TRANSPOSE, TRANSPOSE, 0, A, B, C);
   3291 
   3292         if (a_offset < 0 || a_offset > 255) {
   3293             throw new RSRuntimeException("Invalid a_offset passed to BNNM");
   3294         }
   3295         if (b_offset < 0 || b_offset > 255) {
   3296             throw new RSRuntimeException("Invalid b_offset passed to BNNM");
   3297         }
   3298         int M = -1, N = -1, K = -1;
   3299         M = A.getType().getY();
   3300         N = B.getType().getY();
   3301         K = A.getType().getX();
   3302 
   3303 
   3304         mRS.nScriptIntrinsicBLAS_BNNM(getID(mRS), M, N, K, A.getID(mRS), a_offset, B.getID(mRS), b_offset, C.getID(mRS), c_offset, c_mult);
   3305 
   3306     }
   3307 
   3308 }
   3309