Home | History | Annotate | Download | only in stream_executor
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // Exposes the family of BLAS routines as pre-canned high performance calls for
     17 // use in conjunction with the StreamExecutor abstraction.
     18 //
     19 // Note that this interface is optionally supported by platforms; see
     20 // StreamExecutor::SupportsBlas() for details.
     21 //
     22 // This abstraction makes it simple to entrain BLAS operations on GPU data into
     23 // a Stream -- users typically will not use this API directly, but will use the
     24 // Stream builder methods to entrain these operations "under the hood". For
     25 // example:
     26 //
     27 //  DeviceMemory<float> x = stream_exec->AllocateArray<float>(1024);
     28 //  DeviceMemory<float> y = stream_exec->AllocateArray<float>(1024);
     29 //  // ... populate x and y ...
     30 //  Stream stream{stream_exec};
     31 //  stream
     32 //    .Init()
     33 //    .ThenBlasAxpy(1024, 5.5, x, 1, &y, 1);
     34 //  SE_CHECK_OK(stream.BlockHostUntilDone());
     35 //
     36 // By using stream operations in this manner the user can easily intermix custom
     37 // kernel launches (via StreamExecutor::ThenLaunch()) with these pre-canned BLAS
     38 // routines.
     39 
     40 #ifndef TENSORFLOW_STREAM_EXECUTOR_BLAS_H_
     41 #define TENSORFLOW_STREAM_EXECUTOR_BLAS_H_
     42 
     43 #include <complex>
     44 #include <limits>
     45 
     46 #include "tensorflow/stream_executor/platform/port.h"
     47 #include "tensorflow/stream_executor/lib/array_slice.h"
     48 namespace Eigen {
     49 struct half;
     50 }  // namespace Eigen
     51 
     52 namespace perftools {
     53 namespace gputools {
     54 
     55 class Stream;
     56 class ScratchAllocator;
     57 
     58 template <typename ElemT>
     59 class DeviceMemory;
     60 
     61 namespace blas {
     62 
     63 // Selects how an input matrix is applied by a BLAS operation: as-is,
     64 // transposed, or transposed and conjugated.
     65 enum class Transpose { kNoTranspose, kTranspose, kConjugateTranspose };
     66 
     67 // Returns a name for t. Only the declaration appears here.
     68 string TransposeString(Transpose t);
     69 
     70 // Selects which triangular part (upper or lower) of a symmetric/Hermitian
     71 // matrix is used by an operation.
     72 enum class UpperLower { kUpper, kLower };
     73 
     74 // Returns a name for ul. Only the declaration appears here.
     75 string UpperLowerString(UpperLower ul);
     76 
     77 // Specifies whether a matrix is unit triangular (kUnit) or not (kNonUnit).
     78 enum class Diagonal { kUnit, kNonUnit };
     79 
     80 // Returns a name for d. Only the declaration appears here.
     81 string DiagonalString(Diagonal d);
     82 
     83 // Specifies whether a Hermitian matrix appears on the left or on the right
     84 // side of the operation.
     85 enum class Side { kLeft, kRight };
     86 
     87 // Returns a name for s. Only the declaration appears here.
     88 string SideString(Side s);
     89 
     90 // Type with which intermediate computations of a blas routine are performed.
     91 //
     92 // Some blas calls can perform computations with a type that differs from the
     93 // type of their inputs/outputs.  This lets you e.g. multiply two matrices
     94 // of int8s using float32s to store the matmul's intermediate values.
     95 enum class ComputationType {
     96   kF16,         // 16-bit floating-point
     97   kF32,         // 32-bit floating-point
     98   kF64,         // 64-bit floating-point
     99   kI32,         // 32-bit integer
    100   kComplexF32,  // Complex number composed of two f32s.
    101   kComplexF64,  // Complex number composed of two f64s.
    102 };
    103 
    104 // Converts a ComputationType to a string. Only the declaration appears here.
    105 string ComputationTypeString(ComputationType ty);
    106 
    107 // Opaque identifier for an "algorithm" used by a blas routine.  This functions
    108 // as a hint to the blas library.  The named constants below are all negative.
    109 typedef int64 AlgorithmType;
    110 constexpr AlgorithmType kDefaultAlgorithm = -1;  // Same value as kDefaultGemmAlgo.
    111 constexpr AlgorithmType kDefaultBlasGemm = -2;  // NOTE(review): meaning undocumented here.
    112 constexpr AlgorithmType kDefaultBlasGemv = -3;  // NOTE(review): meaning undocumented here.
    113 constexpr AlgorithmType kNoAlgorithm = -4;  // NOTE(review): meaning undocumented here.
    114 
    115 // blas uses -1 to represent the default algorithm. This happens to match up
    116 // with the CUBLAS_GEMM_DFALT constant, so cuda_blas.cc is using static_cast
    117 // to convert from AlgorithmType to cublasGemmAlgo_t, and uses a static_assert
    118 // to ensure that this assumption does not break.
    119 // If another blas implementation uses a different value for the default
    120 // algorithm, then it needs to convert kDefaultGemmAlgo to that value
    121 // (e.g. via a function called ToWhateverGemmAlgo).
    122 constexpr AlgorithmType kDefaultGemmAlgo = -1;
    123 
    124 // Outcome of one performance experiment, usually timing a particular
    125 // AlgorithmType.  A freshly constructed object is invalid, names
    126 // kDefaultAlgorithm, and reports the largest finite float as elapsed time.
    127 // If the call we were benchmarking failed (a common occurrence; not all
    128 // algorithms are valid for all calls), is_valid() will be false.
    129 class ProfileResult {
    130  public:
    131   bool is_valid() const { return is_valid_; }
    132   void set_is_valid(bool val) { is_valid_ = val; }
    133   AlgorithmType algorithm() const { return algorithm_; }
    134   void set_algorithm(AlgorithmType val) { algorithm_ = val; }
    135   float elapsed_time_in_ms() const { return elapsed_time_in_ms_; }
    136   void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; }
    137 
    138  private:
    139   bool is_valid_ = false;  // Stays false until set_is_valid(true) is called.
    140   AlgorithmType algorithm_ = kDefaultAlgorithm;
    141   float elapsed_time_in_ms_ = std::numeric_limits<float>::max();
    142 };
    143 // Value wrapper around a single AlgorithmType; compares equal by that value.
    144 class AlgorithmConfig {
    145  public:
    146   AlgorithmConfig() : algorithm_(kDefaultAlgorithm) {}  // Uses the default.
    147   explicit AlgorithmConfig(AlgorithmType algorithm) : algorithm_(algorithm) {}
    148   AlgorithmType algorithm() const { return algorithm_; }
    149   void set_algorithm(AlgorithmType val) { algorithm_ = val; }
    150   bool operator==(const AlgorithmConfig &other) const {
    151     return this->algorithm_ == other.algorithm_;
    152   }
    153   bool operator!=(const AlgorithmConfig &other) const {
    154     return !(*this == other);  // Defined as the negation of operator==.
    155   }
    156   string ToString() const;  // Only the declaration appears here.
    157 
    158  private:
    159   AlgorithmType algorithm_;
    160 };
    161 
    162 // BLAS support interface -- this can be derived from a GPU executor when the
    163 // underlying platform has a BLAS library implementation available. See
    164 // StreamExecutor::AsBlas().
    165 //
    166 // Thread-hostile: CUDA associates a CUDA-context with a particular thread in
    167 // the system. Any operation that a user attempts to perform by enqueueing BLAS
    168 // operations on a thread not-associated with the CUDA-context has unknown
    169 // behavior at the current time; see b/13176597
    170 class BlasSupport {
    171  public:
    172   virtual ~BlasSupport() {}
    173 
    174   // Computes the sum of magnitudes of the vector elements.
    175   // result <- |Re x(1)| + |Im x(1)| + |Re x(2)| + |Im x(2)| + ... + |Re x(n)|
    176   // + |Im x(n)|.
    177   // Note that Im x(i) = 0 for real types float/double.
    178   virtual bool DoBlasAsum(Stream *stream, uint64 elem_count,
    179                           const DeviceMemory<float> &x, int incx,
    180                           DeviceMemory<float> *result) = 0;
    181   virtual bool DoBlasAsum(Stream *stream, uint64 elem_count,
    182                           const DeviceMemory<double> &x, int incx,
    183                           DeviceMemory<double> *result) = 0;
    184   virtual bool DoBlasAsum(Stream *stream, uint64 elem_count,
    185                           const DeviceMemory<std::complex<float>> &x, int incx,
    186                           DeviceMemory<float> *result) = 0;
    187   virtual bool DoBlasAsum(Stream *stream, uint64 elem_count,
    188                           const DeviceMemory<std::complex<double>> &x, int incx,
    189                           DeviceMemory<double> *result) = 0;
    190 
    191   // Performs a BLAS y <- ax+y operation.
    192   virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
    193                           const DeviceMemory<float> &x, int incx,
    194                           DeviceMemory<float> *y, int incy) = 0;
    195   virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
    196                           const DeviceMemory<double> &x, int incx,
    197                           DeviceMemory<double> *y, int incy) = 0;
    198   virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count,
    199                           std::complex<float> alpha,
    200                           const DeviceMemory<std::complex<float>> &x, int incx,
    201                           DeviceMemory<std::complex<float>> *y, int incy) = 0;
    202   virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count,
    203                           std::complex<double> alpha,
    204                           const DeviceMemory<std::complex<double>> &x, int incx,
    205                           DeviceMemory<std::complex<double>> *y, int incy) = 0;
    206 
    207   // Copies vector to another vector: y <- x.
    208   virtual bool DoBlasCopy(Stream *stream, uint64 elem_count,
    209                           const DeviceMemory<float> &x, int incx,
    210                           DeviceMemory<float> *y, int incy) = 0;
    211   virtual bool DoBlasCopy(Stream *stream, uint64 elem_count,
    212                           const DeviceMemory<double> &x, int incx,
    213                           DeviceMemory<double> *y, int incy) = 0;
    214   virtual bool DoBlasCopy(Stream *stream, uint64 elem_count,
    215                           const DeviceMemory<std::complex<float>> &x, int incx,
    216                           DeviceMemory<std::complex<float>> *y, int incy) = 0;
    217   virtual bool DoBlasCopy(Stream *stream, uint64 elem_count,
    218                           const DeviceMemory<std::complex<double>> &x, int incx,
    219                           DeviceMemory<std::complex<double>> *y, int incy) = 0;
    220 
    221   // Performs a BLAS dot product result <- x . y.
    222   virtual bool DoBlasDot(Stream *stream, uint64 elem_count,
    223                          const DeviceMemory<float> &x, int incx,
    224                          const DeviceMemory<float> &y, int incy,
    225                          DeviceMemory<float> *result) = 0;
    226   virtual bool DoBlasDot(Stream *stream, uint64 elem_count,
    227                          const DeviceMemory<double> &x, int incx,
    228                          const DeviceMemory<double> &y, int incy,
    229                          DeviceMemory<double> *result) = 0;
    230 
    231   // Performs a BLAS dot product result <- conj(x) . y for complex types.
    232   virtual bool DoBlasDotc(Stream *stream, uint64 elem_count,
    233                           const DeviceMemory<std::complex<float>> &x, int incx,
    234                           const DeviceMemory<std::complex<float>> &y, int incy,
    235                           DeviceMemory<std::complex<float>> *result) = 0;
    236   virtual bool DoBlasDotc(Stream *stream, uint64 elem_count,
    237                           const DeviceMemory<std::complex<double>> &x, int incx,
    238                           const DeviceMemory<std::complex<double>> &y, int incy,
    239                           DeviceMemory<std::complex<double>> *result) = 0;
    240 
    241   // Performs a BLAS dot product result <- x . y for complex types. Note that
    242   // x is unconjugated in this routine.
    243   virtual bool DoBlasDotu(Stream *stream, uint64 elem_count,
    244                           const DeviceMemory<std::complex<float>> &x, int incx,
    245                           const DeviceMemory<std::complex<float>> &y, int incy,
    246                           DeviceMemory<std::complex<float>> *result) = 0;
    247   virtual bool DoBlasDotu(Stream *stream, uint64 elem_count,
    248                           const DeviceMemory<std::complex<double>> &x, int incx,
    249                           const DeviceMemory<std::complex<double>> &y, int incy,
    250                           DeviceMemory<std::complex<double>> *result) = 0;
    251 
    252   // Computes the Euclidean norm of a vector: result <- ||x||.
    253   // See the following link for more information of Euclidean norm:
    254   // http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
    255   virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count,
    256                           const DeviceMemory<float> &x, int incx,
    257                           DeviceMemory<float> *result) = 0;
    258   virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count,
    259                           const DeviceMemory<double> &x, int incx,
    260                           DeviceMemory<double> *result) = 0;
    261   virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count,
    262                           const DeviceMemory<std::complex<float>> &x, int incx,
    263                           DeviceMemory<float> *result) = 0;
    264   virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count,
    265                           const DeviceMemory<std::complex<double>> &x, int incx,
    266                           DeviceMemory<double> *result) = 0;
    267 
    268   // Performs rotation of points in the plane:
    269   // x(i) = c*x(i) + s*y(i)
    270   // y(i) = c*y(i) - s*x(i).
    271   virtual bool DoBlasRot(Stream *stream, uint64 elem_count,
    272                          DeviceMemory<float> *x, int incx,
    273                          DeviceMemory<float> *y, int incy, float c,
    274                          float s) = 0;
    275   virtual bool DoBlasRot(Stream *stream, uint64 elem_count,
    276                          DeviceMemory<double> *x, int incx,
    277                          DeviceMemory<double> *y, int incy, double c,
    278                          double s) = 0;
    279   virtual bool DoBlasRot(Stream *stream, uint64 elem_count,
    280                          DeviceMemory<std::complex<float>> *x, int incx,
    281                          DeviceMemory<std::complex<float>> *y, int incy,
    282                          float c, float s) = 0;
    283   virtual bool DoBlasRot(Stream *stream, uint64 elem_count,
    284                          DeviceMemory<std::complex<double>> *x, int incx,
    285                          DeviceMemory<std::complex<double>> *y, int incy,
    286                          double c, double s) = 0;
    287 
    288   // Computes the parameters for a Givens rotation.
    289   // Given the Cartesian coordinates (a, b) of a point, these routines return
    290   // the parameters c, s, r, and z associated with the Givens rotation. The
    291   // parameters c and s define a unitary matrix such that:
    292   //
    293   //   |  c s |.| a | = | r |
    294   //   | -s c | | b |   | 0 |
    295   //
    296   // The parameter z is defined such that if |a| > |b|, z is s; otherwise if
    297   // c is not 0 z is 1/c; otherwise z is 1.
    298   virtual bool DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
    299                           DeviceMemory<float> *b, DeviceMemory<float> *c,
    300                           DeviceMemory<float> *s) = 0;
    301   virtual bool DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
    302                           DeviceMemory<double> *b, DeviceMemory<double> *c,
    303                           DeviceMemory<double> *s) = 0;
    304   virtual bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
    305                           DeviceMemory<std::complex<float>> *b,
    306                           DeviceMemory<float> *c,
    307                           DeviceMemory<std::complex<float>> *s) = 0;
    308   virtual bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
    309                           DeviceMemory<std::complex<double>> *b,
    310                           DeviceMemory<double> *c,
    311                           DeviceMemory<std::complex<double>> *s) = 0;
    312 
    313   // Performs modified Givens rotation of points in the plane.
    314   // Given two vectors x and y, each vector element of these vectors is replaced
    315   // as follows:
    316   //
    317   //   | x(i) | =  H | x(i) |
    318   //   | y(i) |      | y(i) |
    319   //
    320   // for i=1 to n, where H is a modified Givens transformation matrix whose
    321   // entries are stored in param[1] through param[4]; param[0] is the flag
    322   // selecting the form of H (see the reference BLAS xROTM documentation).
    323   virtual bool DoBlasRotm(Stream *stream, uint64 elem_count,
    324                           DeviceMemory<float> *x, int incx,
    325                           DeviceMemory<float> *y, int incy,
    326                           const DeviceMemory<float> &param) = 0;
    327   virtual bool DoBlasRotm(Stream *stream, uint64 elem_count,
    328                           DeviceMemory<double> *x, int incx,
    329                           DeviceMemory<double> *y, int incy,
    330                           const DeviceMemory<double> &param) = 0;
    331 
    332   // Computes the parameters for a modified Givens rotation.
    333   // Given Cartesian coordinates (x1, y1) of an input vector, these routines
    334   // compute the components of a modified Givens transformation matrix H that
    335   // zeros the y-component of the resulting vector:
    336   //
    337   //   | x1 | =  H | x1 * sqrt(d1) |
    338   //   |  0 |      | y1 * sqrt(d2) |
    339   //
    340   // See the reference BLAS xROTMG documentation for the layout of param.
    341   virtual bool DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
    342                            DeviceMemory<float> *d2, DeviceMemory<float> *x1,
    343                            const DeviceMemory<float> &y1,
    344                            DeviceMemory<float> *param) = 0;
    345   virtual bool DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
    346                            DeviceMemory<double> *d2, DeviceMemory<double> *x1,
    347                            const DeviceMemory<double> &y1,
    348                            DeviceMemory<double> *param) = 0;
    349 
    350   // Computes the product of a vector by a scalar: x <- a*x.
    351   virtual bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
    352                           DeviceMemory<float> *x, int incx) = 0;
    353   virtual bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
    354                           DeviceMemory<double> *x, int incx) = 0;
    355   virtual bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
    356                           DeviceMemory<std::complex<float>> *x, int incx) = 0;
    357   virtual bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
    358                           DeviceMemory<std::complex<double>> *x, int incx) = 0;
    359   virtual bool DoBlasScal(Stream *stream, uint64 elem_count,
    360                           std::complex<float> alpha,
    361                           DeviceMemory<std::complex<float>> *x, int incx) = 0;
    362   virtual bool DoBlasScal(Stream *stream, uint64 elem_count,
    363                           std::complex<double> alpha,
    364                           DeviceMemory<std::complex<double>> *x, int incx) = 0;
    365 
    366   // Swaps a vector with another vector.
    367   virtual bool DoBlasSwap(Stream *stream, uint64 elem_count,
    368                           DeviceMemory<float> *x, int incx,
    369                           DeviceMemory<float> *y, int incy) = 0;
    370   virtual bool DoBlasSwap(Stream *stream, uint64 elem_count,
    371                           DeviceMemory<double> *x, int incx,
    372                           DeviceMemory<double> *y, int incy) = 0;
    373   virtual bool DoBlasSwap(Stream *stream, uint64 elem_count,
    374                           DeviceMemory<std::complex<float>> *x, int incx,
    375                           DeviceMemory<std::complex<float>> *y, int incy) = 0;
    376   virtual bool DoBlasSwap(Stream *stream, uint64 elem_count,
    377                           DeviceMemory<std::complex<double>> *x, int incx,
    378                           DeviceMemory<std::complex<double>> *y, int incy) = 0;
    379 
    380   // Finds the index of the element with maximum absolute value.
    381   virtual bool DoBlasIamax(Stream *stream, uint64 elem_count,
    382                            const DeviceMemory<float> &x, int incx,
    383                            DeviceMemory<int> *result) = 0;
    384   virtual bool DoBlasIamax(Stream *stream, uint64 elem_count,
    385                            const DeviceMemory<double> &x, int incx,
    386                            DeviceMemory<int> *result) = 0;
    387   virtual bool DoBlasIamax(Stream *stream, uint64 elem_count,
    388                            const DeviceMemory<std::complex<float>> &x, int incx,
    389                            DeviceMemory<int> *result) = 0;
    390   virtual bool DoBlasIamax(Stream *stream, uint64 elem_count,
    391                            const DeviceMemory<std::complex<double>> &x,
    392                            int incx, DeviceMemory<int> *result) = 0;
    393 
    394   // Finds the index of the element with minimum absolute value.
    395   virtual bool DoBlasIamin(Stream *stream, uint64 elem_count,
    396                            const DeviceMemory<float> &x, int incx,
    397                            DeviceMemory<int> *result) = 0;
    398   virtual bool DoBlasIamin(Stream *stream, uint64 elem_count,
    399                            const DeviceMemory<double> &x, int incx,
    400                            DeviceMemory<int> *result) = 0;
    401   virtual bool DoBlasIamin(Stream *stream, uint64 elem_count,
    402                            const DeviceMemory<std::complex<float>> &x, int incx,
    403                            DeviceMemory<int> *result) = 0;
    404   virtual bool DoBlasIamin(Stream *stream, uint64 elem_count,
    405                            const DeviceMemory<std::complex<double>> &x,
    406                            int incx, DeviceMemory<int> *result) = 0;
    407 
    408   // Computes a matrix-vector product using a general band matrix:
    409   //
    410   //     y <- alpha * a * x + beta * y,
    411   // or
    412   //     y <- alpha * a' * x + beta * y,
    413   // or
    414   //     y <- alpha * conj(a') * x + beta * y,
    415   //
    416   // alpha and beta are scalars; a is an m-by-n general band matrix, with kl
    417   // sub-diagonals and ku super-diagonals; x is a vector with
    418   // n(trans==kNoTranspose)/m(otherwise) elements;
    419   // y is a vector with m(trans==kNoTranspose)/n(otherwise) elements.
    420   virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
    421                           uint64 n, uint64 kl, uint64 ku, float alpha,
    422                           const DeviceMemory<float> &a, int lda,
    423                           const DeviceMemory<float> &x, int incx, float beta,
    424                           DeviceMemory<float> *y, int incy) = 0;
    425   virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
    426                           uint64 n, uint64 kl, uint64 ku, double alpha,
    427                           const DeviceMemory<double> &a, int lda,
    428                           const DeviceMemory<double> &x, int incx, double beta,
    429                           DeviceMemory<double> *y, int incy) = 0;
    430   virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
    431                           uint64 n, uint64 kl, uint64 ku,
    432                           std::complex<float> alpha,
    433                           const DeviceMemory<std::complex<float>> &a, int lda,
    434                           const DeviceMemory<std::complex<float>> &x, int incx,
    435                           std::complex<float> beta,
    436                           DeviceMemory<std::complex<float>> *y, int incy) = 0;
    437   virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
    438                           uint64 n, uint64 kl, uint64 ku,
    439                           std::complex<double> alpha,
    440                           const DeviceMemory<std::complex<double>> &a, int lda,
    441                           const DeviceMemory<std::complex<double>> &x, int incx,
    442                           std::complex<double> beta,
    443                           DeviceMemory<std::complex<double>> *y, int incy) = 0;
    444 
    445   // Computes a matrix-vector product using a general matrix.
    446   //
    447   //     y <- alpha * a * x + beta * y,
    448   // or
    449   //     y <- alpha * a' * x + beta * y,
    450   // or
    451   //     y <- alpha * conj(a') * x + beta * y,
    452   //
    453   // alpha and beta are scalars; a is an m-by-n general matrix; x is a vector
    454   // with n(trans==kNoTranspose)/m(otherwise) elements;
    455   // y is a vector with m(trans==kNoTranspose)/n(otherwise) elements.
    456   virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
    457                           uint64 n, float alpha, const DeviceMemory<float> &a,
    458                           int lda, const DeviceMemory<float> &x, int incx,
    459                           float beta, DeviceMemory<float> *y, int incy) = 0;
    460   virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
    461                           uint64 n, double alpha, const DeviceMemory<double> &a,
    462                           int lda, const DeviceMemory<double> &x, int incx,
    463                           double beta, DeviceMemory<double> *y, int incy) = 0;
    464   virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
    465                           uint64 n, std::complex<float> alpha,
    466                           const DeviceMemory<std::complex<float>> &a, int lda,
    467                           const DeviceMemory<std::complex<float>> &x, int incx,
    468                           std::complex<float> beta,
    469                           DeviceMemory<std::complex<float>> *y, int incy) = 0;
    470   virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
    471                           uint64 n, std::complex<double> alpha,
    472                           const DeviceMemory<std::complex<double>> &a, int lda,
    473                           const DeviceMemory<std::complex<double>> &x, int incx,
    474                           std::complex<double> beta,
    475                           DeviceMemory<std::complex<double>> *y, int incy) = 0;
    476 
    477   virtual bool DoBlasGemvWithProfiling(
    478       Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
    479       const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
    480       int incx, float beta, DeviceMemory<float> *y, int incy,
    481       ProfileResult *output_profile_result) = 0;
    482   virtual bool DoBlasGemvWithProfiling(
    483       Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
    484       const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
    485       int incx, double beta, DeviceMemory<double> *y, int incy,
    486       ProfileResult *output_profile_result) = 0;
    487   virtual bool DoBlasGemvWithProfiling(
    488       Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
    489       std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
    490       int lda, const DeviceMemory<std::complex<float>> &x, int incx,
    491       std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
    492       ProfileResult *output_profile_result) = 0;
    493   virtual bool DoBlasGemvWithProfiling(
    494       Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
    495       std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
    496       int lda, const DeviceMemory<std::complex<double>> &x, int incx,
    497       std::complex<double> beta, DeviceMemory<std::complex<double>> *y,
    498       int incy, ProfileResult *output_profile_result) = 0;
    499 
    500   // Performs a rank-1 update of a general matrix.
    501   //
    502   //     a <- alpha * x * y' + a,
    503   //
    504   // alpha is a scalar; x is an m-element vector; y is an n-element vector; a is
    505   // an m-by-n general matrix.
    506   virtual bool DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
    507                          const DeviceMemory<float> &x, int incx,
    508                          const DeviceMemory<float> &y, int incy,
    509                          DeviceMemory<float> *a, int lda) = 0;
    510   virtual bool DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
    511                          const DeviceMemory<double> &x, int incx,
    512                          const DeviceMemory<double> &y, int incy,
    513                          DeviceMemory<double> *a, int lda) = 0;
    514 
    515   // Performs a rank-1 update (conjugated) of a general matrix.
    516   //
    517   //     a <- alpha * x * conj(y') + a,
    518   //
    519   // alpha is a scalar; x is an m-element vector; y is an n-element vector; a is
    520   // an m-by-n general matrix.
    521   virtual bool DoBlasGerc(Stream *stream, uint64 m, uint64 n,
    522                           std::complex<float> alpha,
    523                           const DeviceMemory<std::complex<float>> &x, int incx,
    524                           const DeviceMemory<std::complex<float>> &y, int incy,
    525                           DeviceMemory<std::complex<float>> *a, int lda) = 0;
    526   virtual bool DoBlasGerc(Stream *stream, uint64 m, uint64 n,
    527                           std::complex<double> alpha,
    528                           const DeviceMemory<std::complex<double>> &x, int incx,
    529                           const DeviceMemory<std::complex<double>> &y, int incy,
    530                           DeviceMemory<std::complex<double>> *a, int lda) = 0;
    531 
    532   // Performs a rank-1 update (unconjugated) of a general matrix.
    533   //
    534   //     a <- alpha * x * y' + a,
    535   //
    536   // alpha is a complex scalar; x is an m-element vector; y is an n-element
    537   // vector; a is an m-by-n general matrix.
    538   virtual bool DoBlasGeru(Stream *stream, uint64 m, uint64 n,
    539                           std::complex<float> alpha,
    540                           const DeviceMemory<std::complex<float>> &x, int incx,
    541                           const DeviceMemory<std::complex<float>> &y, int incy,
    542                           DeviceMemory<std::complex<float>> *a, int lda) = 0;
    543   virtual bool DoBlasGeru(Stream *stream, uint64 m, uint64 n,
    544                           std::complex<double> alpha,
    545                           const DeviceMemory<std::complex<double>> &x, int incx,
    546                           const DeviceMemory<std::complex<double>> &y, int incy,
    547                           DeviceMemory<std::complex<double>> *a, int lda) = 0;
    548 
    549   // Computes a matrix-vector product using a Hermitian band matrix.
    550   //
    551   //     y <- alpha * a * x + beta * y,
    552   //
    553   // alpha and beta are complex scalars; a is an n-by-n Hermitian band matrix,
    554   // with k super-diagonals; x and y are n-element vectors.
    555   virtual bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
    556                           uint64 k, std::complex<float> alpha,
    557                           const DeviceMemory<std::complex<float>> &a, int lda,
    558                           const DeviceMemory<std::complex<float>> &x, int incx,
    559                           std::complex<float> beta,
    560                           DeviceMemory<std::complex<float>> *y, int incy) = 0;
    561   virtual bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
    562                           uint64 k, std::complex<double> alpha,
    563                           const DeviceMemory<std::complex<double>> &a, int lda,
    564                           const DeviceMemory<std::complex<double>> &x, int incx,
    565                           std::complex<double> beta,
    566                           DeviceMemory<std::complex<double>> *y, int incy) = 0;
    567 
    568   // Computes a matrix-vector product using a Hermitian matrix.
    569   //
    570   //     y <- alpha * a * x + beta * y,
    571   //
    572   // alpha and beta are complex scalars; a is an n-by-n Hermitian matrix; x and
    573   // y are n-element vectors.
    574   virtual bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
    575                           std::complex<float> alpha,
    576                           const DeviceMemory<std::complex<float>> &a, int lda,
    577                           const DeviceMemory<std::complex<float>> &x, int incx,
    578                           std::complex<float> beta,
    579                           DeviceMemory<std::complex<float>> *y, int incy) = 0;
    580   virtual bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
    581                           std::complex<double> alpha,
    582                           const DeviceMemory<std::complex<double>> &a, int lda,
    583                           const DeviceMemory<std::complex<double>> &x, int incx,
    584                           std::complex<double> beta,
    585                           DeviceMemory<std::complex<double>> *y, int incy) = 0;
    586 
    587   // Performs a rank-1 update of a Hermitian matrix.
    588   //
    589   //     a <- alpha * x * conj(x') + a,
    590   //
    591   // alpha is a real scalar; x is an n-element vector; a is an n-by-n Hermitian
    592   // matrix.
    593   virtual bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
    594                          float alpha,
    595                          const DeviceMemory<std::complex<float>> &x, int incx,
    596                          DeviceMemory<std::complex<float>> *a, int lda) = 0;
    597   virtual bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
    598                          double alpha,
    599                          const DeviceMemory<std::complex<double>> &x, int incx,
    600                          DeviceMemory<std::complex<double>> *a, int lda) = 0;
    601 
    602   // Performs a rank-2 update of a Hermitian matrix.
    603   //
    604   //     a <- alpha * x * conj(y') + conj(alpha) * y * conj(x') + a,
    605   //
    606   // alpha is a scalar; x and y are n-element vectors; a is an n-by-n Hermitian
    607   // matrix.
    608   virtual bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
    609                           std::complex<float> alpha,
    610                           const DeviceMemory<std::complex<float>> &x, int incx,
    611                           const DeviceMemory<std::complex<float>> &y, int incy,
    612                           DeviceMemory<std::complex<float>> *a, int lda) = 0;
    613   virtual bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
    614                           std::complex<double> alpha,
    615                           const DeviceMemory<std::complex<double>> &x, int incx,
    616                           const DeviceMemory<std::complex<double>> &y, int incy,
    617                           DeviceMemory<std::complex<double>> *a, int lda) = 0;
    618 
    619   // Computes a matrix-vector product using a Hermitian packed matrix.
    620   //
    621   //     y <- alpha * a * x + beta * y,
    622   //
    623   // alpha and beta are complex scalars; a is an n-by-n Hermitian matrix,
    624   // supplied in packed form as ap; x and y are n-element vectors.
    625   virtual bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
    626                           std::complex<float> alpha,
    627                           const DeviceMemory<std::complex<float>> &ap,
    628                           const DeviceMemory<std::complex<float>> &x, int incx,
    629                           std::complex<float> beta,
    630                           DeviceMemory<std::complex<float>> *y, int incy) = 0;
    631   virtual bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
    632                           std::complex<double> alpha,
    633                           const DeviceMemory<std::complex<double>> &ap,
    634                           const DeviceMemory<std::complex<double>> &x, int incx,
    635                           std::complex<double> beta,
    636                           DeviceMemory<std::complex<double>> *y, int incy) = 0;
    637 
    638   // Performs a rank-1 update of a Hermitian packed matrix.
    639   //
    640   //     a <- alpha * x * conj(x') + a,
    641   //
    642   // alpha is a real scalar; x is an n-element vector; a is an n-by-n Hermitian
    643   // matrix, supplied in packed form as ap.
    644   virtual bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
    645                          float alpha,
    646                          const DeviceMemory<std::complex<float>> &x, int incx,
    647                          DeviceMemory<std::complex<float>> *ap) = 0;
    648   virtual bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
    649                          double alpha,
    650                          const DeviceMemory<std::complex<double>> &x, int incx,
    651                          DeviceMemory<std::complex<double>> *ap) = 0;
    652 
    653   // Performs a rank-2 update of a Hermitian packed matrix.
    654   //
    655   //     a <- alpha * x * conj(y') + conj(alpha) * y * conj(x') + a,
    656   //
    657   // alpha is a scalar; x and y are n-element vectors; a is an n-by-n Hermitian
    658   // matrix, supplied in packed form.
    659   virtual bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
    660                           std::complex<float> alpha,
    661                           const DeviceMemory<std::complex<float>> &x, int incx,
    662                           const DeviceMemory<std::complex<float>> &y, int incy,
    663                           DeviceMemory<std::complex<float>> *ap) = 0;
    664   virtual bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
    665                           std::complex<double> alpha,
    666                           const DeviceMemory<std::complex<double>> &x, int incx,
    667                           const DeviceMemory<std::complex<double>> &y, int incy,
    668                           DeviceMemory<std::complex<double>> *ap) = 0;
    669 
    670   // Computes a matrix-vector product using a symmetric band matrix.
    671   //
    672   //     y <- alpha * a * x + beta * y,
    673   //
    674   // alpha and beta are real scalars; a is an n-by-n symmetric band matrix, with
    675   // k super-diagonals; x and y are n-element vectors.
    676   virtual bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
    677                           uint64 k, float alpha, const DeviceMemory<float> &a,
    678                           int lda, const DeviceMemory<float> &x, int incx,
    679                           float beta, DeviceMemory<float> *y, int incy) = 0;
    680   virtual bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
    681                           uint64 k, double alpha, const DeviceMemory<double> &a,
    682                           int lda, const DeviceMemory<double> &x, int incx,
    683                           double beta, DeviceMemory<double> *y, int incy) = 0;
    684 
    685   // Computes a matrix-vector product using a symmetric packed matrix.
    686   //
    687   //     y <- alpha * a * x + beta * y,
    688   //
    689   // alpha and beta are real scalars; a is an n-by-n symmetric matrix, supplied
    690   // in packed form as ap; x and y are n-element vectors.
    691   virtual bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
    692                           float alpha, const DeviceMemory<float> &ap,
    693                           const DeviceMemory<float> &x, int incx, float beta,
    694                           DeviceMemory<float> *y, int incy) = 0;
    695   virtual bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
    696                           double alpha, const DeviceMemory<double> &ap,
    697                           const DeviceMemory<double> &x, int incx, double beta,
    698                           DeviceMemory<double> *y, int incy) = 0;
    699 
    700   // Performs a rank-1 update of a symmetric packed matrix.
    701   //
    702   //     a <- alpha * x * x' + a,
    703   //
    704   // alpha is a real scalar; x is an n-element vector; a is an n-by-n symmetric
    705   // matrix, supplied in packed form as ap.
    706   virtual bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
    707                          float alpha, const DeviceMemory<float> &x, int incx,
    708                          DeviceMemory<float> *ap) = 0;
    709   virtual bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
    710                          double alpha, const DeviceMemory<double> &x, int incx,
    711                          DeviceMemory<double> *ap) = 0;
    712 
    713   // Performs a rank-2 update of a symmetric packed matrix.
    714   //
    715   //     a <- alpha * x * y' + alpha * y * x' + a,
    716   //
    717   // alpha is a scalar; x and y are n-element vectors; a is an n-by-n symmetric
    718   // matrix, supplied in packed form.
    719   virtual bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
    720                           float alpha, const DeviceMemory<float> &x, int incx,
    721                           const DeviceMemory<float> &y, int incy,
    722                           DeviceMemory<float> *ap) = 0;
    723   virtual bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
    724                           double alpha, const DeviceMemory<double> &x, int incx,
    725                           const DeviceMemory<double> &y, int incy,
    726                           DeviceMemory<double> *ap) = 0;
    727 
    728   // Computes a matrix-vector product for a symmetric matrix.
    729   //
    730   //     y <- alpha * a * x + beta * y,
    731   //
    732   // alpha and beta are real scalars; a is an n-by-n symmetric matrix; x and y
    733   // are n-element vectors.
    734   virtual bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
    735                           float alpha, const DeviceMemory<float> &a, int lda,
    736                           const DeviceMemory<float> &x, int incx, float beta,
    737                           DeviceMemory<float> *y, int incy) = 0;
    738   virtual bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
    739                           double alpha, const DeviceMemory<double> &a, int lda,
    740                           const DeviceMemory<double> &x, int incx, double beta,
    741                           DeviceMemory<double> *y, int incy) = 0;
    742 
    743   // Performs a rank-1 update of a symmetric matrix.
    744   //
    745   //     a <- alpha * x * x' + a,
    746   //
    747   // alpha is a real scalar; x is an n-element vector; a is an n-by-n symmetric
    748   // matrix.
    749   virtual bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
    750                          float alpha, const DeviceMemory<float> &x, int incx,
    751                          DeviceMemory<float> *a, int lda) = 0;
    752   virtual bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
    753                          double alpha, const DeviceMemory<double> &x, int incx,
    754                          DeviceMemory<double> *a, int lda) = 0;
    755 
    756   // Performs a rank-2 update of a symmetric matrix.
    757   //
    758   //     a <- alpha * x * y' + alpha * y * x' + a,
    759   //
    760   // alpha is a scalar; x and y are n-element vectors; a is an n-by-n symmetric
    761   // matrix.
    762   virtual bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
    763                           float alpha, const DeviceMemory<float> &x, int incx,
    764                           const DeviceMemory<float> &y, int incy,
    765                           DeviceMemory<float> *a, int lda) = 0;
    766   virtual bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
    767                           double alpha, const DeviceMemory<double> &x, int incx,
    768                           const DeviceMemory<double> &y, int incy,
    769                           DeviceMemory<double> *a, int lda) = 0;
    770 
    771   // Computes a matrix-vector product using a triangular band matrix.
    772   //
    773   //     x <- a * x,
    774   // or
    775   //     x <- a' * x,
    776   // or
    777   //     x <- conj(a') * x,
    778   //
    779   // a is an n-by-n unit, or non-unit, upper or lower triangular band matrix,
    780   // with k+1 diagonals; x is an n-element vector.
    781   virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
    782                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    783                           uint64 k, const DeviceMemory<float> &a, int lda,
    784                           DeviceMemory<float> *x, int incx) = 0;
    785   virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
    786                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    787                           uint64 k, const DeviceMemory<double> &a, int lda,
    788                           DeviceMemory<double> *x, int incx) = 0;
    789   virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
    790                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    791                           uint64 k, const DeviceMemory<std::complex<float>> &a,
    792                           int lda, DeviceMemory<std::complex<float>> *x,
    793                           int incx) = 0;
    794   virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
    795                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    796                           uint64 k, const DeviceMemory<std::complex<double>> &a,
    797                           int lda, DeviceMemory<std::complex<double>> *x,
    798                           int incx) = 0;
    799 
    800   // Solves a system of linear equations whose coefficients are in a triangular
    801   // band matrix as below:
    802   //
    803   //     a * x = b,
    804   // or
    805   //     a' * x = b,
    806   // or
    807   //     conj(a') * x = b,
    808   //
    809   // b and x are n-element vectors; a is an n-by-n unit, or non-unit, upper or
    810   // lower triangular band matrix, with k+1 diagonals.
          // NOTE(review): no separate b parameter is declared; presumably x holds b
          // on entry and is overwritten with the solution -- confirm.
    811   virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
    812                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    813                           uint64 k, const DeviceMemory<float> &a, int lda,
    814                           DeviceMemory<float> *x, int incx) = 0;
    815   virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
    816                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    817                           uint64 k, const DeviceMemory<double> &a, int lda,
    818                           DeviceMemory<double> *x, int incx) = 0;
    819   virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
    820                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    821                           uint64 k, const DeviceMemory<std::complex<float>> &a,
    822                           int lda, DeviceMemory<std::complex<float>> *x,
    823                           int incx) = 0;
    824   virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
    825                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    826                           uint64 k, const DeviceMemory<std::complex<double>> &a,
    827                           int lda, DeviceMemory<std::complex<double>> *x,
    828                           int incx) = 0;
    829 
    830   // Computes a matrix-vector product using a triangular packed matrix.
    831   //
    832   //     x <- a * x,
    833   // or
    834   //     x <- a' * x,
    835   // or
    836   //     x <- conj(a') * x,
    837   //
    838   // a is an n-by-n unit, or non-unit, upper or lower triangular matrix,
    839   // supplied in packed form as ap; x is an n-element vector.
    840   virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
    841                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    842                           const DeviceMemory<float> &ap, DeviceMemory<float> *x,
    843                           int incx) = 0;
    844   virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
    845                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    846                           const DeviceMemory<double> &ap,
    847                           DeviceMemory<double> *x, int incx) = 0;
    848   virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
    849                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    850                           const DeviceMemory<std::complex<float>> &ap,
    851                           DeviceMemory<std::complex<float>> *x, int incx) = 0;
    852   virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
    853                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    854                           const DeviceMemory<std::complex<double>> &ap,
    855                           DeviceMemory<std::complex<double>> *x, int incx) = 0;
    856 
    857   // Solves a system of linear equations whose coefficients are in a triangular
    858   // packed matrix as below:
    859   //
    860   //     a * x = b,
    861   // or
    862   //     a' * x = b,
    863   // or
    864   //     conj(a') * x = b,
    865   //
    866   // b and x are n-element vectors; a is an n-by-n unit, or non-unit, upper or
    867   // lower triangular matrix, supplied in packed form as ap.
          // NOTE(review): no separate b parameter is declared; presumably x holds b
          // on entry and is overwritten with the solution -- confirm.
    868   virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
    869                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    870                           const DeviceMemory<float> &ap, DeviceMemory<float> *x,
    871                           int incx) = 0;
    872   virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
    873                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    874                           const DeviceMemory<double> &ap,
    875                           DeviceMemory<double> *x, int incx) = 0;
    876   virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
    877                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    878                           const DeviceMemory<std::complex<float>> &ap,
    879                           DeviceMemory<std::complex<float>> *x, int incx) = 0;
    880   virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
    881                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    882                           const DeviceMemory<std::complex<double>> &ap,
    883                           DeviceMemory<std::complex<double>> *x, int incx) = 0;
    884 
    885   // Computes a matrix-vector product using a triangular matrix.
    886   //
    887   //     x <- a * x,
    888   // or
    889   //     x <- a' * x,
    890   // or
    891   //     x <- conj(a') * x,
    892   //
    893   // a is an n-by-n unit, or non-unit, upper or lower triangular matrix; x is an
    894   // n-element vector.
    895   virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
    896                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    897                           const DeviceMemory<float> &a, int lda,
    898                           DeviceMemory<float> *x, int incx) = 0;
    899   virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
    900                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    901                           const DeviceMemory<double> &a, int lda,
    902                           DeviceMemory<double> *x, int incx) = 0;
    903   virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
    904                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    905                           const DeviceMemory<std::complex<float>> &a, int lda,
    906                           DeviceMemory<std::complex<float>> *x, int incx) = 0;
    907   virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
    908                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    909                           const DeviceMemory<std::complex<double>> &a, int lda,
    910                           DeviceMemory<std::complex<double>> *x, int incx) = 0;
    911 
    912   // Solves a system of linear equations whose coefficients are in a triangular
    913   // matrix as below:
    914   //
    915   //     a * x = b,
    916   // or
    917   //     a' * x = b,
    918   // or
    919   //     conj(a') * x = b,
    920   //
    921   // b and x are n-element vectors; a is an n-by-n unit, or non-unit, upper or
    922   // lower triangular matrix.
          // NOTE(review): no separate b parameter is declared; presumably x holds b
          // on entry and is overwritten with the solution -- confirm.
    923   virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
    924                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    925                           const DeviceMemory<float> &a, int lda,
    926                           DeviceMemory<float> *x, int incx) = 0;
    927   virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
    928                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    929                           const DeviceMemory<double> &a, int lda,
    930                           DeviceMemory<double> *x, int incx) = 0;
    931   virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
    932                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    933                           const DeviceMemory<std::complex<float>> &a, int lda,
    934                           DeviceMemory<std::complex<float>> *x, int incx) = 0;
    935   virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
    936                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
    937                           const DeviceMemory<std::complex<double>> &a, int lda,
    938                           DeviceMemory<std::complex<double>> *x, int incx) = 0;
    939 
    940   // Computes a matrix-matrix product with general matrices:
    941   //
    942   //     c <- alpha * op(a) * op(b) + beta * c,
    943   //
    944   // op(X) is one of op(X) = X, or op(X) = X', or op(X) = conj(X'); alpha and
    945   // beta are scalars; a, b, and c are matrices; op(a) is an m-by-k matrix;
    946   // op(b) is a k-by-n matrix; c is an m-by-n matrix.
    947   //
    948   // Note: The half interface takes float alpha/beta and uses float precision
    949   // internally; the version that uses half precision internally is not yet
    950   // supported. There is no batched version of the half-precision interface.
    951   virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa,
    952                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
    953                           float alpha, const DeviceMemory<Eigen::half> &a,
    954                           int lda, const DeviceMemory<Eigen::half> &b, int ldb,
    955                           float beta, DeviceMemory<Eigen::half> *c,
    956                           int ldc) = 0;
    957   virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa,
    958                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
    959                           float alpha, const DeviceMemory<float> &a, int lda,
    960                           const DeviceMemory<float> &b, int ldb, float beta,
    961                           DeviceMemory<float> *c, int ldc) = 0;
    962   virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa,
    963                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
    964                           double alpha, const DeviceMemory<double> &a, int lda,
    965                           const DeviceMemory<double> &b, int ldb, double beta,
    966                           DeviceMemory<double> *c, int ldc) = 0;
    967   virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa,
    968                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
    969                           std::complex<float> alpha,
    970                           const DeviceMemory<std::complex<float>> &a, int lda,
    971                           const DeviceMemory<std::complex<float>> &b, int ldb,
    972                           std::complex<float> beta,
    973                           DeviceMemory<std::complex<float>> *c, int ldc) = 0;
    974   virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa,
    975                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
    976                           std::complex<double> alpha,
    977                           const DeviceMemory<std::complex<double>> &a, int lda,
    978                           const DeviceMemory<std::complex<double>> &b, int ldb,
    979                           std::complex<double> beta,
    980                           DeviceMemory<std::complex<double>> *c, int ldc) = 0;
    981 
          // Variants of DoBlasGemm that take the same arguments plus a trailing
          // output_profile_result pointer.
          // NOTE(review): presumably profiling data for the call is reported through
          // output_profile_result -- confirm against the implementations.
    982   virtual bool DoBlasGemmWithProfiling(
    983       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    984       uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
    985       int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
    986       DeviceMemory<Eigen::half> *c, int ldc,
    987       ProfileResult *output_profile_result) = 0;
    988   virtual bool DoBlasGemmWithProfiling(
    989       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    990       uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
    991       const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
    992       int ldc, ProfileResult *output_profile_result) = 0;
    993   virtual bool DoBlasGemmWithProfiling(
    994       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    995       uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
    996       const DeviceMemory<double> &b, int ldb, double beta,
    997       DeviceMemory<double> *c, int ldc,
    998       ProfileResult *output_profile_result) = 0;
    999   virtual bool DoBlasGemmWithProfiling(
   1000       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1001       uint64 n, uint64 k, std::complex<float> alpha,
   1002       const DeviceMemory<std::complex<float>> &a, int lda,
   1003       const DeviceMemory<std::complex<float>> &b, int ldb,
   1004       std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
   1005       ProfileResult *output_profile_result) = 0;
   1006   virtual bool DoBlasGemmWithProfiling(
   1007       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1008       uint64 n, uint64 k, std::complex<double> alpha,
   1009       const DeviceMemory<std::complex<double>> &a, int lda,
   1010       const DeviceMemory<std::complex<double>> &b, int ldb,
   1011       std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
   1012       ProfileResult *output_profile_result) = 0;
   1013 
   1014   // Gets a list of supported algorithms for DoBlasGemmWithAlgorithm.
          // NOTE(review): presumably the list is written to *out_algorithms and the
          // return value reports success -- confirm against the implementations.
   1015   virtual bool GetBlasGemmAlgorithms(
   1016       std::vector<AlgorithmType> *out_algorithms) = 0;
   1017 
   1018   // Like DoBlasGemm, but accepts an algorithm and a compute type.
   1019   //
   1020   // The compute type lets you say (e.g.) that the inputs and outputs are
   1021   // Eigen::halfs, but you want the internal computations to be done with
   1022   // float32 precision.
   1023   //
   1024   // Note the subtle difference in the version that accepts Eigen::half --
   1025   // alpha and beta have type const Eigen::half&, not float.
   1026   //
   1027   // If output_profile_result is not null, a failure here does not put the
   1028   // stream in a failure state.  Instead, success/failure is indicated by
   1029   // output_profile_result->is_valid().  This lets you use this function for
   1030   // choosing the best algorithm among many (some of which may fail) without
   1031   // creating a new Stream for each attempt.
   1032   virtual bool DoBlasGemmWithAlgorithm(
   1033       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1034       uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
   1035       const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int32> *c,
   1036       int ldc, ComputationType computation_type, AlgorithmType algorithm,
   1037       ProfileResult *output_profile_result) = 0;
   1038   virtual bool DoBlasGemmWithAlgorithm(
   1039       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1040       uint64 n, uint64 k, const Eigen::half &alpha,
   1041       const DeviceMemory<Eigen::half> &a, int lda,
   1042       const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta,
   1043       DeviceMemory<Eigen::half> *c, int ldc, ComputationType computation_type,
   1044       AlgorithmType algorithm, ProfileResult *output_profile_result) = 0;
   1045   virtual bool DoBlasGemmWithAlgorithm(
   1046       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1047       uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
   1048       const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
   1049       int ldc, ComputationType computation_type, AlgorithmType algorithm,
   1050       ProfileResult *output_profile_result) = 0;
   1051   virtual bool DoBlasGemmWithAlgorithm(
   1052       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1053       uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
   1054       const DeviceMemory<double> &b, int ldb, double beta,
   1055       DeviceMemory<double> *c, int ldc, ComputationType computation_type,
   1056       AlgorithmType algorithm, ProfileResult *output_profile_result) = 0;
   1057   virtual bool DoBlasGemmWithAlgorithm(
   1058       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1059       uint64 n, uint64 k, std::complex<float> alpha,
   1060       const DeviceMemory<std::complex<float>> &a, int lda,
   1061       const DeviceMemory<std::complex<float>> &b, int ldb,
   1062       std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
   1063       ComputationType computation_type, AlgorithmType algorithm,
   1064       ProfileResult *output_profile_result) = 0;
   1065   virtual bool DoBlasGemmWithAlgorithm(
   1066       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1067       uint64 n, uint64 k, std::complex<double> alpha,
   1068       const DeviceMemory<std::complex<double>> &a, int lda,
   1069       const DeviceMemory<std::complex<double>> &b, int ldb,
   1070       std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
   1071       ComputationType computation_type, AlgorithmType algorithm,
   1072       ProfileResult *output_profile_result) = 0;
   1073 
   1074   // Computes a batch of matrix-matrix product with general matrices.
   1075   // This is a batched version of DoBlasGemm.
   1076   // The batched GEMM computes matrix product for each input/output in a, b,
   1077   // and c, which contain batch_count DeviceMemory objects.
          //
          // a, b and c are each passed as an ArraySlice of DeviceMemory pointers,
          // while a single alpha, beta, lda, ldb and ldc is supplied for the whole
          // batch.  All overloads are pure virtual.
          // NOTE(review): scratch_allocator is presumably used to obtain temporary
          // device memory for the batched call -- confirm against implementations.
          //
          // Overload for float.
   1078   virtual bool DoBlasGemmBatched(
   1079       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1080       uint64 n, uint64 k, float alpha,
   1081       const port::ArraySlice<DeviceMemory<float> *> &a, int lda,
   1082       const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta,
   1083       const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
   1084       int batch_count, ScratchAllocator *scratch_allocator) = 0;
          // Overload for double.
   1085   virtual bool DoBlasGemmBatched(
   1086       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1087       uint64 n, uint64 k, double alpha,
   1088       const port::ArraySlice<DeviceMemory<double> *> &a, int lda,
   1089       const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta,
   1090       const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,
   1091       int batch_count, ScratchAllocator *scratch_allocator) = 0;
          // Overload for std::complex<float>.
   1092   virtual bool DoBlasGemmBatched(
   1093       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1094       uint64 n, uint64 k, std::complex<float> alpha,
   1095       const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda,
   1096       const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb,
   1097       std::complex<float> beta,
   1098       const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc,
   1099       int batch_count, ScratchAllocator *scratch_allocator) = 0;
          // Overload for std::complex<double>.
   1100   virtual bool DoBlasGemmBatched(
   1101       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1102       uint64 n, uint64 k, std::complex<double> alpha,
   1103       const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda,
   1104       const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb,
   1105       std::complex<double> beta,
   1106       const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
   1107       int batch_count, ScratchAllocator *scratch_allocator) = 0;
   1108 
   1109   // Computes a matrix-matrix product where one input matrix is Hermitian:
   1110   //
   1111   //     c <- alpha * a * b + beta * c,
   1112   // or
   1113   //     c <- alpha * b * a + beta * c,
   1114   //
   1115   // alpha and beta are scalars; a is a Hermitian matrix; b and c are m-by-n
   1116   // matrices.
          //
          // Declared only for complex element types; a and b are read-only inputs
          // and c is the in/out matrix passed by pointer.
          // NOTE(review): side presumably selects between the two forms above and
          // uplo which triangle of a is referenced -- confirm.
          //
          // Overload for std::complex<float>.
   1117   virtual bool DoBlasHemm(Stream *stream, blas::Side side,
   1118                           blas::UpperLower uplo, uint64 m, uint64 n,
   1119                           std::complex<float> alpha,
   1120                           const DeviceMemory<std::complex<float>> &a, int lda,
   1121                           const DeviceMemory<std::complex<float>> &b, int ldb,
   1122                           std::complex<float> beta,
   1123                           DeviceMemory<std::complex<float>> *c, int ldc) = 0;
          // Overload for std::complex<double>.
   1124   virtual bool DoBlasHemm(Stream *stream, blas::Side side,
   1125                           blas::UpperLower uplo, uint64 m, uint64 n,
   1126                           std::complex<double> alpha,
   1127                           const DeviceMemory<std::complex<double>> &a, int lda,
   1128                           const DeviceMemory<std::complex<double>> &b, int ldb,
   1129                           std::complex<double> beta,
   1130                           DeviceMemory<std::complex<double>> *c, int ldc) = 0;
   1131 
   1132   // Performs a Hermitian rank-k update.
   1133   //
   1134   //     c <- alpha * a * conj(a') + beta * c,
   1135   // or
   1136   //     c <- alpha * conj(a') * a + beta * c,
   1137   //
   1138   // alpha and beta are scalars; c is an n-by-n Hermitian matrix; a is an
   1139   // n-by-k matrix in the first case and a k-by-n matrix in the second case.
          //
          // Note that alpha and beta are real (float/double) even though a and c
          // hold complex elements.
          // NOTE(review): trans presumably selects between the two forms above --
          // confirm.
          //
          // Overload for std::complex<float> matrices with float scalars.
   1140   virtual bool DoBlasHerk(Stream *stream, blas::UpperLower uplo,
   1141                           blas::Transpose trans, uint64 n, uint64 k,
   1142                           float alpha,
   1143                           const DeviceMemory<std::complex<float>> &a, int lda,
   1144                           float beta, DeviceMemory<std::complex<float>> *c,
   1145                           int ldc) = 0;
          // Overload for std::complex<double> matrices with double scalars.
   1146   virtual bool DoBlasHerk(Stream *stream, blas::UpperLower uplo,
   1147                           blas::Transpose trans, uint64 n, uint64 k,
   1148                           double alpha,
   1149                           const DeviceMemory<std::complex<double>> &a, int lda,
   1150                           double beta, DeviceMemory<std::complex<double>> *c,
   1151                           int ldc) = 0;
   1152 
   1153   // Performs a Hermitian rank-2k update.
   1154   //
   1155   //     c <- alpha * a * conj(b') + conj(alpha) * b * conj(a') + beta * c,
   1156   // or
   1157   //     c <- alpha * conj(b') * a + conj(alpha) * conj(a') * b + beta * c,
   1158   //
   1159   // alpha and beta are scalars; c is an n-by-n Hermitian matrix; a and b are
   1160   // n-by-k matrices in the first case and k-by-n matrices in the second case.
          //
          // Note that alpha is complex while beta is real (float/double).
          // NOTE(review): trans presumably selects between the two forms above --
          // confirm.
          //
          // Overload for std::complex<float> matrices (float beta).
   1161   virtual bool DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
   1162                            blas::Transpose trans, uint64 n, uint64 k,
   1163                            std::complex<float> alpha,
   1164                            const DeviceMemory<std::complex<float>> &a, int lda,
   1165                            const DeviceMemory<std::complex<float>> &b, int ldb,
   1166                            float beta, DeviceMemory<std::complex<float>> *c,
   1167                            int ldc) = 0;
          // Overload for std::complex<double> matrices (double beta).
   1168   virtual bool DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
   1169                            blas::Transpose trans, uint64 n, uint64 k,
   1170                            std::complex<double> alpha,
   1171                            const DeviceMemory<std::complex<double>> &a, int lda,
   1172                            const DeviceMemory<std::complex<double>> &b, int ldb,
   1173                            double beta, DeviceMemory<std::complex<double>> *c,
   1174                            int ldc) = 0;
   1175 
   1176   // Computes a matrix-matrix product where one input matrix is symmetric.
   1177   //
   1178   //     c <- alpha * a * b + beta * c,
   1179   // or
   1180   //     c <- alpha * b * a + beta * c,
   1181   //
   1182   // alpha and beta are scalars; a is a symmetric matrix; b and c are m-by-n
   1183   // matrices.
          //
          // a and b are read-only inputs; c is the in/out matrix passed by pointer.
          // alpha and beta have the same type as the matrix elements.
          // NOTE(review): side presumably selects between the two forms above and
          // uplo which triangle of a is referenced -- confirm.
          //
          // Overload for float.
   1184   virtual bool DoBlasSymm(Stream *stream, blas::Side side,
   1185                           blas::UpperLower uplo, uint64 m, uint64 n,
   1186                           float alpha, const DeviceMemory<float> &a, int lda,
   1187                           const DeviceMemory<float> &b, int ldb, float beta,
   1188                           DeviceMemory<float> *c, int ldc) = 0;
          // Overload for double.
   1189   virtual bool DoBlasSymm(Stream *stream, blas::Side side,
   1190                           blas::UpperLower uplo, uint64 m, uint64 n,
   1191                           double alpha, const DeviceMemory<double> &a, int lda,
   1192                           const DeviceMemory<double> &b, int ldb, double beta,
   1193                           DeviceMemory<double> *c, int ldc) = 0;
          // Overload for std::complex<float>.
   1194   virtual bool DoBlasSymm(Stream *stream, blas::Side side,
   1195                           blas::UpperLower uplo, uint64 m, uint64 n,
   1196                           std::complex<float> alpha,
   1197                           const DeviceMemory<std::complex<float>> &a, int lda,
   1198                           const DeviceMemory<std::complex<float>> &b, int ldb,
   1199                           std::complex<float> beta,
   1200                           DeviceMemory<std::complex<float>> *c, int ldc) = 0;
          // Overload for std::complex<double>.
   1201   virtual bool DoBlasSymm(Stream *stream, blas::Side side,
   1202                           blas::UpperLower uplo, uint64 m, uint64 n,
   1203                           std::complex<double> alpha,
   1204                           const DeviceMemory<std::complex<double>> &a, int lda,
   1205                           const DeviceMemory<std::complex<double>> &b, int ldb,
   1206                           std::complex<double> beta,
   1207                           DeviceMemory<std::complex<double>> *c, int ldc) = 0;
   1208 
   1209   // Performs a symmetric rank-k update.
   1210   //
   1211   //     c <- alpha * a * a' + beta * c,
   1212   // or
   1213   //     c <- alpha * a' * a + beta * c,
   1214   //
   1215   // alpha and beta are scalars; c is an n-by-n symmetric matrix; a is an
   1216   // n-by-k matrix in the first case and a k-by-n matrix in the second case.
          //
          // a is a read-only input; c is the in/out matrix passed by pointer.
          // alpha and beta have the same type as the matrix elements.
          // NOTE(review): trans presumably selects between the two forms above --
          // confirm.
          //
          // Overload for float.
   1217   virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   1218                           blas::Transpose trans, uint64 n, uint64 k,
   1219                           float alpha, const DeviceMemory<float> &a, int lda,
   1220                           float beta, DeviceMemory<float> *c, int ldc) = 0;
          // Overload for double.
   1221   virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   1222                           blas::Transpose trans, uint64 n, uint64 k,
   1223                           double alpha, const DeviceMemory<double> &a, int lda,
   1224                           double beta, DeviceMemory<double> *c, int ldc) = 0;
          // Overload for std::complex<float>.
   1225   virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   1226                           blas::Transpose trans, uint64 n, uint64 k,
   1227                           std::complex<float> alpha,
   1228                           const DeviceMemory<std::complex<float>> &a, int lda,
   1229                           std::complex<float> beta,
   1230                           DeviceMemory<std::complex<float>> *c, int ldc) = 0;
          // Overload for std::complex<double>.
   1231   virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   1232                           blas::Transpose trans, uint64 n, uint64 k,
   1233                           std::complex<double> alpha,
   1234                           const DeviceMemory<std::complex<double>> &a, int lda,
   1235                           std::complex<double> beta,
   1236                           DeviceMemory<std::complex<double>> *c, int ldc) = 0;
   1237 
   1238   // Performs a symmetric rank-2k update.
   1239   //
   1240   //     c <- alpha * a * b' + alpha * b * a' + beta * c,
   1241   // or
   1242   //     c <- alpha * b' * a + alpha * a' * b + beta * c,
   1243   //
   1244   // alpha and beta are scalars; c is an n-by-n symmetric matrix; a and b are
   1245   // n-by-k matrices in the first case and k-by-n matrices in the second case.
          //
          // a and b are read-only inputs; c is the in/out matrix passed by pointer.
          // alpha and beta have the same type as the matrix elements.
          // NOTE(review): trans presumably selects between the two forms above --
          // confirm.
          //
          // Overload for float.
   1246   virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   1247                            blas::Transpose trans, uint64 n, uint64 k,
   1248                            float alpha, const DeviceMemory<float> &a, int lda,
   1249                            const DeviceMemory<float> &b, int ldb, float beta,
   1250                            DeviceMemory<float> *c, int ldc) = 0;
          // Overload for double.
   1251   virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   1252                            blas::Transpose trans, uint64 n, uint64 k,
   1253                            double alpha, const DeviceMemory<double> &a, int lda,
   1254                            const DeviceMemory<double> &b, int ldb, double beta,
   1255                            DeviceMemory<double> *c, int ldc) = 0;
          // Overload for std::complex<float>.
   1256   virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   1257                            blas::Transpose trans, uint64 n, uint64 k,
   1258                            std::complex<float> alpha,
   1259                            const DeviceMemory<std::complex<float>> &a, int lda,
   1260                            const DeviceMemory<std::complex<float>> &b, int ldb,
   1261                            std::complex<float> beta,
   1262                            DeviceMemory<std::complex<float>> *c, int ldc) = 0;
          // Overload for std::complex<double>.
   1263   virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   1264                            blas::Transpose trans, uint64 n, uint64 k,
   1265                            std::complex<double> alpha,
   1266                            const DeviceMemory<std::complex<double>> &a, int lda,
   1267                            const DeviceMemory<std::complex<double>> &b, int ldb,
   1268                            std::complex<double> beta,
   1269                            DeviceMemory<std::complex<double>> *c, int ldc) = 0;
   1270 
   1271   // Computes a matrix-matrix product where one input matrix is triangular.
   1272   //
   1273   //     b <- alpha * op(a) * b,
   1274   // or
   1275   //     b <- alpha * b * op(a)
   1276   //
   1277   // alpha is a scalar; b is an m-by-n matrix; a is a unit, or non-unit, upper
   1278   // or lower triangular matrix; op(a) is one of op(a) = a, or op(a) = a', or
   1279   // op(a) = conj(a').
          //
          // a is a read-only input; b is passed by pointer and, per the formulas
          // above, is both an operand and the destination of the result.
          // NOTE(review): side, uplo, transa and diag presumably select the form
          // of the product, the triangle of a, op(a) and unit/non-unit diagonal
          // respectively -- confirm.
          //
          // Overload for float.
   1280   virtual bool DoBlasTrmm(Stream *stream, blas::Side side,
   1281                           blas::UpperLower uplo, blas::Transpose transa,
   1282                           blas::Diagonal diag, uint64 m, uint64 n, float alpha,
   1283                           const DeviceMemory<float> &a, int lda,
   1284                           DeviceMemory<float> *b, int ldb) = 0;
          // Overload for double.
   1285   virtual bool DoBlasTrmm(Stream *stream, blas::Side side,
   1286                           blas::UpperLower uplo, blas::Transpose transa,
   1287                           blas::Diagonal diag, uint64 m, uint64 n, double alpha,
   1288                           const DeviceMemory<double> &a, int lda,
   1289                           DeviceMemory<double> *b, int ldb) = 0;
          // Overload for std::complex<float>.
   1290   virtual bool DoBlasTrmm(Stream *stream, blas::Side side,
   1291                           blas::UpperLower uplo, blas::Transpose transa,
   1292                           blas::Diagonal diag, uint64 m, uint64 n,
   1293                           std::complex<float> alpha,
   1294                           const DeviceMemory<std::complex<float>> &a, int lda,
   1295                           DeviceMemory<std::complex<float>> *b, int ldb) = 0;
          // Overload for std::complex<double>.
   1296   virtual bool DoBlasTrmm(Stream *stream, blas::Side side,
   1297                           blas::UpperLower uplo, blas::Transpose transa,
   1298                           blas::Diagonal diag, uint64 m, uint64 n,
   1299                           std::complex<double> alpha,
   1300                           const DeviceMemory<std::complex<double>> &a, int lda,
   1301                           DeviceMemory<std::complex<double>> *b, int ldb) = 0;
   1302 
   1303   // Solves a triangular matrix equation.
   1304   //
   1305   //     op(a) * x = alpha * b,
   1306   // or
   1307   //     x * op(a) = alpha * b
   1308   //
   1309   // alpha is a scalar; x and b are m-by-n matrices; a is a unit, or non-unit,
   1310   // upper or lower triangular matrix; op(a) is one of op(a) = a, or op(a) = a',
   1311   // or op(a) = conj(a').
          //
          // There is no separate parameter for x: a is a read-only input and b is
          // the only matrix passed by pointer.
          // NOTE(review): the solution x presumably overwrites b -- confirm against
          // implementations.
          //
          // Overload for float.
   1312   virtual bool DoBlasTrsm(Stream *stream, blas::Side side,
   1313                           blas::UpperLower uplo, blas::Transpose transa,
   1314                           blas::Diagonal diag, uint64 m, uint64 n, float alpha,
   1315                           const DeviceMemory<float> &a, int lda,
   1316                           DeviceMemory<float> *b, int ldb) = 0;
          // Overload for double.
   1317   virtual bool DoBlasTrsm(Stream *stream, blas::Side side,
   1318                           blas::UpperLower uplo, blas::Transpose transa,
   1319                           blas::Diagonal diag, uint64 m, uint64 n, double alpha,
   1320                           const DeviceMemory<double> &a, int lda,
   1321                           DeviceMemory<double> *b, int ldb) = 0;
          // Overload for std::complex<float>.
   1322   virtual bool DoBlasTrsm(Stream *stream, blas::Side side,
   1323                           blas::UpperLower uplo, blas::Transpose transa,
   1324                           blas::Diagonal diag, uint64 m, uint64 n,
   1325                           std::complex<float> alpha,
   1326                           const DeviceMemory<std::complex<float>> &a, int lda,
   1327                           DeviceMemory<std::complex<float>> *b, int ldb) = 0;
          // Overload for std::complex<double>.
   1328   virtual bool DoBlasTrsm(Stream *stream, blas::Side side,
   1329                           blas::UpperLower uplo, blas::Transpose transa,
   1330                           blas::Diagonal diag, uint64 m, uint64 n,
   1331                           std::complex<double> alpha,
   1332                           const DeviceMemory<std::complex<double>> &a, int lda,
   1333                           DeviceMemory<std::complex<double>> *b, int ldb) = 0;
   1334 
   1335  protected:
          // Default constructor with an empty body.  It is protected, so a
          // BlasSupport can only be constructed as the base of a derived class.
   1336   BlasSupport() {}
   1337 
   1338  private:
          // NOTE(review): SE_DISALLOW_COPY_AND_ASSIGN is defined elsewhere; going
          // by its name it presumably disables BlasSupport's copy constructor and
          // copy assignment operator -- confirm against the macro definition.
   1339   SE_DISALLOW_COPY_AND_ASSIGN(BlasSupport);
   1340 };
   1341 
   1342 // Macro used to quickly declare overrides for abstract virtuals in the
   1343 // BlasSupport base class.
   1344 #define TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES                  \
   1345   bool DoBlasAsum(Stream *stream, uint64 elem_count,                           \
   1346                   const DeviceMemory<float> &x, int incx,                      \
   1347                   DeviceMemory<float> *result) override;                       \
   1348   bool DoBlasAsum(Stream *stream, uint64 elem_count,                           \
   1349                   const DeviceMemory<double> &x, int incx,                     \
   1350                   DeviceMemory<double> *result) override;                      \
   1351   bool DoBlasAsum(Stream *stream, uint64 elem_count,                           \
   1352                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1353                   DeviceMemory<float> *result) override;                       \
   1354   bool DoBlasAsum(Stream *stream, uint64 elem_count,                           \
   1355                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1356                   DeviceMemory<double> *result) override;                      \
   1357   bool DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,              \
   1358                   const DeviceMemory<float> &x, int incx,                      \
   1359                   DeviceMemory<float> *y, int incy) override;                  \
   1360   bool DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,             \
   1361                   const DeviceMemory<double> &x, int incx,                     \
   1362                   DeviceMemory<double> *y, int incy) override;                 \
   1363   bool DoBlasAxpy(Stream *stream, uint64 elem_count,                           \
   1364                   std::complex<float> alpha,                                   \
   1365                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1366                   DeviceMemory<std::complex<float>> *y, int incy) override;    \
   1367   bool DoBlasAxpy(Stream *stream, uint64 elem_count,                           \
   1368                   std::complex<double> alpha,                                  \
   1369                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1370                   DeviceMemory<std::complex<double>> *y, int incy) override;   \
   1371   bool DoBlasCopy(Stream *stream, uint64 elem_count,                           \
   1372                   const DeviceMemory<float> &x, int incx,                      \
   1373                   DeviceMemory<float> *y, int incy) override;                  \
   1374   bool DoBlasCopy(Stream *stream, uint64 elem_count,                           \
   1375                   const DeviceMemory<double> &x, int incx,                     \
   1376                   DeviceMemory<double> *y, int incy) override;                 \
   1377   bool DoBlasCopy(Stream *stream, uint64 elem_count,                           \
   1378                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1379                   DeviceMemory<std::complex<float>> *y, int incy) override;    \
   1380   bool DoBlasCopy(Stream *stream, uint64 elem_count,                           \
   1381                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1382                   DeviceMemory<std::complex<double>> *y, int incy) override;   \
   1383   bool DoBlasDot(Stream *stream, uint64 elem_count,                            \
   1384                  const DeviceMemory<float> &x, int incx,                       \
   1385                  const DeviceMemory<float> &y, int incy,                       \
   1386                  DeviceMemory<float> *result) override;                        \
   1387   bool DoBlasDot(Stream *stream, uint64 elem_count,                            \
   1388                  const DeviceMemory<double> &x, int incx,                      \
   1389                  const DeviceMemory<double> &y, int incy,                      \
   1390                  DeviceMemory<double> *result) override;                       \
   1391   bool DoBlasDotc(Stream *stream, uint64 elem_count,                           \
   1392                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1393                   const DeviceMemory<std::complex<float>> &y, int incy,        \
   1394                   DeviceMemory<std::complex<float>> *result) override;         \
   1395   bool DoBlasDotc(Stream *stream, uint64 elem_count,                           \
   1396                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1397                   const DeviceMemory<std::complex<double>> &y, int incy,       \
   1398                   DeviceMemory<std::complex<double>> *result) override;        \
   1399   bool DoBlasDotu(Stream *stream, uint64 elem_count,                           \
   1400                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1401                   const DeviceMemory<std::complex<float>> &y, int incy,        \
   1402                   DeviceMemory<std::complex<float>> *result) override;         \
   1403   bool DoBlasDotu(Stream *stream, uint64 elem_count,                           \
   1404                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1405                   const DeviceMemory<std::complex<double>> &y, int incy,       \
   1406                   DeviceMemory<std::complex<double>> *result) override;        \
   1407   bool DoBlasNrm2(Stream *stream, uint64 elem_count,                           \
   1408                   const DeviceMemory<float> &x, int incx,                      \
   1409                   DeviceMemory<float> *result) override;                       \
   1410   bool DoBlasNrm2(Stream *stream, uint64 elem_count,                           \
   1411                   const DeviceMemory<double> &x, int incx,                     \
   1412                   DeviceMemory<double> *result) override;                      \
   1413   bool DoBlasNrm2(Stream *stream, uint64 elem_count,                           \
   1414                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1415                   DeviceMemory<float> *result) override;                       \
   1416   bool DoBlasNrm2(Stream *stream, uint64 elem_count,                           \
   1417                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1418                   DeviceMemory<double> *result) override;                      \
   1419   bool DoBlasRot(Stream *stream, uint64 elem_count, DeviceMemory<float> *x,    \
   1420                  int incx, DeviceMemory<float> *y, int incy, float c, float s) \
   1421       override;                                                                \
   1422   bool DoBlasRot(Stream *stream, uint64 elem_count, DeviceMemory<double> *x,   \
   1423                  int incx, DeviceMemory<double> *y, int incy, double c,        \
   1424                  double s) override;                                           \
   1425   bool DoBlasRot(Stream *stream, uint64 elem_count,                            \
   1426                  DeviceMemory<std::complex<float>> *x, int incx,               \
   1427                  DeviceMemory<std::complex<float>> *y, int incy, float c,      \
   1428                  float s) override;                                            \
   1429   bool DoBlasRot(Stream *stream, uint64 elem_count,                            \
   1430                  DeviceMemory<std::complex<double>> *x, int incx,              \
   1431                  DeviceMemory<std::complex<double>> *y, int incy, double c,    \
   1432                  double s) override;                                           \
   1433   bool DoBlasRotg(Stream *stream, DeviceMemory<float> *a,                      \
   1434                   DeviceMemory<float> *b, DeviceMemory<float> *c,              \
   1435                   DeviceMemory<float> *s) override;                            \
   1436   bool DoBlasRotg(Stream *stream, DeviceMemory<double> *a,                     \
   1437                   DeviceMemory<double> *b, DeviceMemory<double> *c,            \
   1438                   DeviceMemory<double> *s) override;                           \
   1439   bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,        \
   1440                   DeviceMemory<std::complex<float>> *b,                        \
   1441                   DeviceMemory<float> *c,                                      \
   1442                   DeviceMemory<std::complex<float>> *s) override;              \
   1443   bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,       \
   1444                   DeviceMemory<std::complex<double>> *b,                       \
   1445                   DeviceMemory<double> *c,                                     \
   1446                   DeviceMemory<std::complex<double>> *s) override;             \
   1447   bool DoBlasRotm(Stream *stream, uint64 elem_count, DeviceMemory<float> *x,   \
   1448                   int incx, DeviceMemory<float> *y, int incy,                  \
   1449                   const DeviceMemory<float> &param) override;                  \
   1450   bool DoBlasRotm(Stream *stream, uint64 elem_count, DeviceMemory<double> *x,  \
   1451                   int incx, DeviceMemory<double> *y, int incy,                 \
   1452                   const DeviceMemory<double> &param) override;                 \
   1453   bool DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,                    \
   1454                    DeviceMemory<float> *d2, DeviceMemory<float> *x1,           \
   1455                    const DeviceMemory<float> &y1, DeviceMemory<float> *param)  \
   1456       override;                                                                \
   1457   bool DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,                   \
   1458                    DeviceMemory<double> *d2, DeviceMemory<double> *x1,         \
   1459                    const DeviceMemory<double> &y1,                             \
   1460                    DeviceMemory<double> *param) override;                      \
   1461   bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha,              \
   1462                   DeviceMemory<float> *x, int incx) override;                  \
   1463   bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha,             \
   1464                   DeviceMemory<double> *x, int incx) override;                 \
   1465   bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha,              \
   1466                   DeviceMemory<std::complex<float>> *x, int incx) override;    \
   1467   bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha,             \
   1468                   DeviceMemory<std::complex<double>> *x, int incx) override;   \
   1469   bool DoBlasScal(Stream *stream, uint64 elem_count,                           \
   1470                   std::complex<float> alpha,                                   \
   1471                   DeviceMemory<std::complex<float>> *x, int incx) override;    \
   1472   bool DoBlasScal(Stream *stream, uint64 elem_count,                           \
   1473                   std::complex<double> alpha,                                  \
   1474                   DeviceMemory<std::complex<double>> *x, int incx) override;   \
   1475   bool DoBlasSwap(Stream *stream, uint64 elem_count, DeviceMemory<float> *x,   \
   1476                   int incx, DeviceMemory<float> *y, int incy) override;        \
   1477   bool DoBlasSwap(Stream *stream, uint64 elem_count, DeviceMemory<double> *x,  \
   1478                   int incx, DeviceMemory<double> *y, int incy) override;       \
   1479   bool DoBlasSwap(Stream *stream, uint64 elem_count,                           \
   1480                   DeviceMemory<std::complex<float>> *x, int incx,              \
   1481                   DeviceMemory<std::complex<float>> *y, int incy) override;    \
   1482   bool DoBlasSwap(Stream *stream, uint64 elem_count,                           \
   1483                   DeviceMemory<std::complex<double>> *x, int incx,             \
   1484                   DeviceMemory<std::complex<double>> *y, int incy) override;   \
   1485   bool DoBlasIamax(Stream *stream, uint64 elem_count,                          \
   1486                    const DeviceMemory<float> &x, int incx,                     \
   1487                    DeviceMemory<int> *result) override;                        \
   1488   bool DoBlasIamax(Stream *stream, uint64 elem_count,                          \
   1489                    const DeviceMemory<double> &x, int incx,                    \
   1490                    DeviceMemory<int> *result) override;                        \
   1491   bool DoBlasIamax(Stream *stream, uint64 elem_count,                          \
   1492                    const DeviceMemory<std::complex<float>> &x, int incx,       \
   1493                    DeviceMemory<int> *result) override;                        \
   1494   bool DoBlasIamax(Stream *stream, uint64 elem_count,                          \
   1495                    const DeviceMemory<std::complex<double>> &x, int incx,      \
   1496                    DeviceMemory<int> *result) override;                        \
   1497   bool DoBlasIamin(Stream *stream, uint64 elem_count,                          \
   1498                    const DeviceMemory<float> &x, int incx,                     \
   1499                    DeviceMemory<int> *result) override;                        \
   1500   bool DoBlasIamin(Stream *stream, uint64 elem_count,                          \
   1501                    const DeviceMemory<double> &x, int incx,                    \
   1502                    DeviceMemory<int> *result) override;                        \
   1503   bool DoBlasIamin(Stream *stream, uint64 elem_count,                          \
   1504                    const DeviceMemory<std::complex<float>> &x, int incx,       \
   1505                    DeviceMemory<int> *result) override;                        \
   1506   bool DoBlasIamin(Stream *stream, uint64 elem_count,                          \
   1507                    const DeviceMemory<std::complex<double>> &x, int incx,      \
   1508                    DeviceMemory<int> *result) override;                        \
   1509   bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n,   \
   1510                   uint64 kl, uint64 ku, float alpha,                           \
   1511                   const DeviceMemory<float> &a, int lda,                       \
   1512                   const DeviceMemory<float> &x, int incx, float beta,          \
   1513                   DeviceMemory<float> *y, int incy) override;                  \
   1514   bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n,   \
   1515                   uint64 kl, uint64 ku, double alpha,                          \
   1516                   const DeviceMemory<double> &a, int lda,                      \
   1517                   const DeviceMemory<double> &x, int incx, double beta,        \
   1518                   DeviceMemory<double> *y, int incy) override;                 \
   1519   bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n,   \
   1520                   uint64 kl, uint64 ku, std::complex<float> alpha,             \
   1521                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   1522                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1523                   std::complex<float> beta,                                    \
   1524                   DeviceMemory<std::complex<float>> *y, int incy) override;    \
   1525   bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n,   \
   1526                   uint64 kl, uint64 ku, std::complex<double> alpha,            \
   1527                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   1528                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1529                   std::complex<double> beta,                                   \
   1530                   DeviceMemory<std::complex<double>> *y, int incy) override;   \
   1531   bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n,   \
   1532                   float alpha, const DeviceMemory<float> &a, int lda,          \
   1533                   const DeviceMemory<float> &x, int incx, float beta,          \
   1534                   DeviceMemory<float> *y, int incy) override;                  \
   1535   bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n,   \
   1536                   double alpha, const DeviceMemory<double> &a, int lda,        \
   1537                   const DeviceMemory<double> &x, int incx, double beta,        \
   1538                   DeviceMemory<double> *y, int incy) override;                 \
   1539   bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n,   \
   1540                   std::complex<float> alpha,                                   \
   1541                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   1542                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1543                   std::complex<float> beta,                                    \
   1544                   DeviceMemory<std::complex<float>> *y, int incy) override;    \
   1545   bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n,   \
   1546                   std::complex<double> alpha,                                  \
   1547                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   1548                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1549                   std::complex<double> beta,                                   \
   1550                   DeviceMemory<std::complex<double>> *y, int incy) override;   \
   1551   bool DoBlasGemvWithProfiling(                                                \
   1552       Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,  \
   1553       const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,     \
   1554       int incx, float beta, DeviceMemory<float> *y, int incy,                  \
   1555       blas::ProfileResult *output_profile_result) override;                    \
   1556   bool DoBlasGemvWithProfiling(                                                \
   1557       Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha, \
   1558       const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,   \
   1559       int incx, double beta, DeviceMemory<double> *y, int incy,                \
   1560       blas::ProfileResult *output_profile_result) override;                    \
   1561   bool DoBlasGemvWithProfiling(                                                \
   1562       Stream *stream, blas::Transpose trans, uint64 m, uint64 n,               \
   1563       std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,   \
   1564       int lda, const DeviceMemory<std::complex<float>> &x, int incx,           \
   1565       std::complex<float> beta, DeviceMemory<std::complex<float>> *y,          \
   1566       int incy, blas::ProfileResult *output_profile_result) override;          \
   1567   bool DoBlasGemvWithProfiling(                                                \
   1568       Stream *stream, blas::Transpose trans, uint64 m, uint64 n,               \
   1569       std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a, \
   1570       int lda, const DeviceMemory<std::complex<double>> &x, int incx,          \
   1571       std::complex<double> beta, DeviceMemory<std::complex<double>> *y,        \
   1572       int incy, blas::ProfileResult *output_profile_result) override;          \
   1573   bool DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,              \
   1574                  const DeviceMemory<float> &x, int incx,                       \
   1575                  const DeviceMemory<float> &y, int incy,                       \
   1576                  DeviceMemory<float> *a, int lda) override;                    \
   1577   bool DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,             \
   1578                  const DeviceMemory<double> &x, int incx,                      \
   1579                  const DeviceMemory<double> &y, int incy,                      \
   1580                  DeviceMemory<double> *a, int lda) override;                   \
   1581   bool DoBlasGerc(Stream *stream, uint64 m, uint64 n,                          \
   1582                   std::complex<float> alpha,                                   \
   1583                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1584                   const DeviceMemory<std::complex<float>> &y, int incy,        \
   1585                   DeviceMemory<std::complex<float>> *a, int lda) override;     \
   1586   bool DoBlasGerc(Stream *stream, uint64 m, uint64 n,                          \
   1587                   std::complex<double> alpha,                                  \
   1588                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1589                   const DeviceMemory<std::complex<double>> &y, int incy,       \
   1590                   DeviceMemory<std::complex<double>> *a, int lda) override;    \
   1591   bool DoBlasGeru(Stream *stream, uint64 m, uint64 n,                          \
   1592                   std::complex<float> alpha,                                   \
   1593                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1594                   const DeviceMemory<std::complex<float>> &y, int incy,        \
   1595                   DeviceMemory<std::complex<float>> *a, int lda) override;     \
   1596   bool DoBlasGeru(Stream *stream, uint64 m, uint64 n,                          \
   1597                   std::complex<double> alpha,                                  \
   1598                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1599                   const DeviceMemory<std::complex<double>> &y, int incy,       \
   1600                   DeviceMemory<std::complex<double>> *a, int lda) override;    \
   1601   bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k,   \
   1602                   std::complex<float> alpha,                                   \
   1603                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   1604                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1605                   std::complex<float> beta,                                    \
   1606                   DeviceMemory<std::complex<float>> *y, int incy) override;    \
   1607   bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k,   \
   1608                   std::complex<double> alpha,                                  \
   1609                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   1610                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1611                   std::complex<double> beta,                                   \
   1612                   DeviceMemory<std::complex<double>> *y, int incy) override;   \
   1613   bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1614                   std::complex<float> alpha,                                   \
   1615                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   1616                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1617                   std::complex<float> beta,                                    \
   1618                   DeviceMemory<std::complex<float>> *y, int incy) override;    \
   1619   bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1620                   std::complex<double> alpha,                                  \
   1621                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   1622                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1623                   std::complex<double> beta,                                   \
   1624                   DeviceMemory<std::complex<double>> *y, int incy) override;   \
   1625   bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \
   1626                  const DeviceMemory<std::complex<float>> &x, int incx,         \
   1627                  DeviceMemory<std::complex<float>> *a, int lda) override;      \
   1628   bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,              \
   1629                  double alpha, const DeviceMemory<std::complex<double>> &x,    \
   1630                  int incx, DeviceMemory<std::complex<double>> *a, int lda)     \
   1631       override;                                                                \
   1632   bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1633                   std::complex<float> alpha,                                   \
   1634                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1635                   const DeviceMemory<std::complex<float>> &y, int incy,        \
   1636                   DeviceMemory<std::complex<float>> *a, int lda) override;     \
   1637   bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1638                   std::complex<double> alpha,                                  \
   1639                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1640                   const DeviceMemory<std::complex<double>> &y, int incy,       \
   1641                   DeviceMemory<std::complex<double>> *a, int lda) override;    \
   1642   bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1643                   std::complex<float> alpha,                                   \
   1644                   const DeviceMemory<std::complex<float>> &ap,                 \
   1645                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1646                   std::complex<float> beta,                                    \
   1647                   DeviceMemory<std::complex<float>> *y, int incy) override;    \
   1648   bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1649                   std::complex<double> alpha,                                  \
   1650                   const DeviceMemory<std::complex<double>> &ap,                \
   1651                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1652                   std::complex<double> beta,                                   \
   1653                   DeviceMemory<std::complex<double>> *y, int incy) override;   \
   1654   bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \
   1655                  const DeviceMemory<std::complex<float>> &x, int incx,         \
   1656                  DeviceMemory<std::complex<float>> *ap) override;              \
   1657   bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,              \
   1658                  double alpha, const DeviceMemory<std::complex<double>> &x,    \
   1659                  int incx, DeviceMemory<std::complex<double>> *ap) override;   \
   1660   bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1661                   std::complex<float> alpha,                                   \
   1662                   const DeviceMemory<std::complex<float>> &x, int incx,        \
   1663                   const DeviceMemory<std::complex<float>> &y, int incy,        \
   1664                   DeviceMemory<std::complex<float>> *ap) override;             \
   1665   bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1666                   std::complex<double> alpha,                                  \
   1667                   const DeviceMemory<std::complex<double>> &x, int incx,       \
   1668                   const DeviceMemory<std::complex<double>> &y, int incy,       \
   1669                   DeviceMemory<std::complex<double>> *ap) override;            \
   1670   bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k,   \
   1671                   float alpha, const DeviceMemory<float> &a, int lda,          \
   1672                   const DeviceMemory<float> &x, int incx, float beta,          \
   1673                   DeviceMemory<float> *y, int incy) override;                  \
   1674   bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k,   \
   1675                   double alpha, const DeviceMemory<double> &a, int lda,        \
   1676                   const DeviceMemory<double> &x, int incx, double beta,        \
   1677                   DeviceMemory<double> *y, int incy) override;                 \
   1678   bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1679                   float alpha, const DeviceMemory<float> &ap,                  \
   1680                   const DeviceMemory<float> &x, int incx, float beta,          \
   1681                   DeviceMemory<float> *y, int incy) override;                  \
   1682   bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1683                   double alpha, const DeviceMemory<double> &ap,                \
   1684                   const DeviceMemory<double> &x, int incx, double beta,        \
   1685                   DeviceMemory<double> *y, int incy) override;                 \
   1686   bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \
   1687                  const DeviceMemory<float> &x, int incx,                       \
   1688                  DeviceMemory<float> *ap) override;                            \
   1689   bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,              \
   1690                  double alpha, const DeviceMemory<double> &x, int incx,        \
   1691                  DeviceMemory<double> *ap) override;                           \
   1692   bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1693                   float alpha, const DeviceMemory<float> &x, int incx,         \
   1694                   const DeviceMemory<float> &y, int incy,                      \
   1695                   DeviceMemory<float> *ap) override;                           \
   1696   bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1697                   double alpha, const DeviceMemory<double> &x, int incx,       \
   1698                   const DeviceMemory<double> &y, int incy,                     \
   1699                   DeviceMemory<double> *ap) override;                          \
   1700   bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1701                   float alpha, const DeviceMemory<float> &a, int lda,          \
   1702                   const DeviceMemory<float> &x, int incx, float beta,          \
   1703                   DeviceMemory<float> *y, int incy) override;                  \
   1704   bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1705                   double alpha, const DeviceMemory<double> &a, int lda,        \
   1706                   const DeviceMemory<double> &x, int incx, double beta,        \
   1707                   DeviceMemory<double> *y, int incy) override;                 \
   1708   bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \
   1709                  const DeviceMemory<float> &x, int incx,                       \
   1710                  DeviceMemory<float> *a, int lda) override;                    \
   1711   bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,              \
   1712                  double alpha, const DeviceMemory<double> &x, int incx,        \
   1713                  DeviceMemory<double> *a, int lda) override;                   \
   1714   bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1715                   float alpha, const DeviceMemory<float> &x, int incx,         \
   1716                   const DeviceMemory<float> &y, int incy,                      \
   1717                   DeviceMemory<float> *a, int lda) override;                   \
   1718   bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,             \
   1719                   double alpha, const DeviceMemory<double> &x, int incx,       \
   1720                   const DeviceMemory<double> &y, int incy,                     \
   1721                   DeviceMemory<double> *a, int lda) override;                  \
   1722   bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo,                       \
   1723                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1724                   uint64 k, const DeviceMemory<float> &a, int lda,             \
   1725                   DeviceMemory<float> *x, int incx) override;                  \
   1726   bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo,                       \
   1727                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1728                   uint64 k, const DeviceMemory<double> &a, int lda,            \
   1729                   DeviceMemory<double> *x, int incx) override;                 \
   1730   bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo,                       \
   1731                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1732                   uint64 k, const DeviceMemory<std::complex<float>> &a,        \
   1733                   int lda, DeviceMemory<std::complex<float>> *x, int incx)     \
   1734       override;                                                                \
   1735   bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo,                       \
   1736                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1737                   uint64 k, const DeviceMemory<std::complex<double>> &a,       \
   1738                   int lda, DeviceMemory<std::complex<double>> *x, int incx)    \
   1739       override;                                                                \
   1740   bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo,                       \
   1741                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1742                   uint64 k, const DeviceMemory<float> &a, int lda,             \
   1743                   DeviceMemory<float> *x, int incx) override;                  \
   1744   bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo,                       \
   1745                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1746                   uint64 k, const DeviceMemory<double> &a, int lda,            \
   1747                   DeviceMemory<double> *x, int incx) override;                 \
   1748   bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo,                       \
   1749                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1750                   uint64 k, const DeviceMemory<std::complex<float>> &a,        \
   1751                   int lda, DeviceMemory<std::complex<float>> *x, int incx)     \
   1752       override;                                                                \
   1753   bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo,                       \
   1754                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1755                   uint64 k, const DeviceMemory<std::complex<double>> &a,       \
   1756                   int lda, DeviceMemory<std::complex<double>> *x, int incx)    \
   1757       override;                                                                \
   1758   bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo,                       \
   1759                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1760                   const DeviceMemory<float> &ap, DeviceMemory<float> *x,       \
   1761                   int incx) override;                                          \
   1762   bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo,                       \
   1763                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1764                   const DeviceMemory<double> &ap, DeviceMemory<double> *x,     \
   1765                   int incx) override;                                          \
   1766   bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo,                       \
   1767                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1768                   const DeviceMemory<std::complex<float>> &ap,                 \
   1769                   DeviceMemory<std::complex<float>> *x, int incx) override;    \
   1770   bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo,                       \
   1771                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1772                   const DeviceMemory<std::complex<double>> &ap,                \
   1773                   DeviceMemory<std::complex<double>> *x, int incx) override;   \
   1774   bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo,                       \
   1775                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1776                   const DeviceMemory<float> &ap, DeviceMemory<float> *x,       \
   1777                   int incx) override;                                          \
   1778   bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo,                       \
   1779                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1780                   const DeviceMemory<double> &ap, DeviceMemory<double> *x,     \
   1781                   int incx) override;                                          \
   1782   bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo,                       \
   1783                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1784                   const DeviceMemory<std::complex<float>> &ap,                 \
   1785                   DeviceMemory<std::complex<float>> *x, int incx) override;    \
   1786   bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo,                       \
   1787                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1788                   const DeviceMemory<std::complex<double>> &ap,                \
   1789                   DeviceMemory<std::complex<double>> *x, int incx) override;   \
   1790   bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo,                       \
   1791                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1792                   const DeviceMemory<float> &a, int lda,                       \
   1793                   DeviceMemory<float> *x, int incx) override;                  \
   1794   bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo,                       \
   1795                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1796                   const DeviceMemory<double> &a, int lda,                      \
   1797                   DeviceMemory<double> *x, int incx) override;                 \
   1798   bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo,                       \
   1799                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1800                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   1801                   DeviceMemory<std::complex<float>> *x, int incx) override;    \
   1802   bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo,                       \
   1803                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1804                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   1805                   DeviceMemory<std::complex<double>> *x, int incx) override;   \
   1806   bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo,                       \
   1807                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1808                   const DeviceMemory<float> &a, int lda,                       \
   1809                   DeviceMemory<float> *x, int incx) override;                  \
   1810   bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo,                       \
   1811                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1812                   const DeviceMemory<double> &a, int lda,                      \
   1813                   DeviceMemory<double> *x, int incx) override;                 \
   1814   bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo,                       \
   1815                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1816                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   1817                   DeviceMemory<std::complex<float>> *x, int incx) override;    \
   1818   bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo,                       \
   1819                   blas::Transpose trans, blas::Diagonal diag, uint64 n,        \
   1820                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   1821                   DeviceMemory<std::complex<double>> *x, int incx) override;   \
   1822   bool DoBlasGemm(Stream *stream, blas::Transpose transa,                      \
   1823                   blas::Transpose transb, uint64 m, uint64 n, uint64 k,        \
   1824                   float alpha, const DeviceMemory<Eigen::half> &a, int lda,    \
   1825                   const DeviceMemory<Eigen::half> &b, int ldb, float beta,     \
   1826                   DeviceMemory<Eigen::half> *c, int ldc) override;             \
   1827   bool DoBlasGemm(Stream *stream, blas::Transpose transa,                      \
   1828                   blas::Transpose transb, uint64 m, uint64 n, uint64 k,        \
   1829                   float alpha, const DeviceMemory<float> &a, int lda,          \
   1830                   const DeviceMemory<float> &b, int ldb, float beta,           \
   1831                   DeviceMemory<float> *c, int ldc) override;                   \
   1832   bool DoBlasGemm(Stream *stream, blas::Transpose transa,                      \
   1833                   blas::Transpose transb, uint64 m, uint64 n, uint64 k,        \
   1834                   double alpha, const DeviceMemory<double> &a, int lda,        \
   1835                   const DeviceMemory<double> &b, int ldb, double beta,         \
   1836                   DeviceMemory<double> *c, int ldc) override;                  \
   1837   bool DoBlasGemm(Stream *stream, blas::Transpose transa,                      \
   1838                   blas::Transpose transb, uint64 m, uint64 n, uint64 k,        \
   1839                   std::complex<float> alpha,                                   \
   1840                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   1841                   const DeviceMemory<std::complex<float>> &b, int ldb,         \
   1842                   std::complex<float> beta,                                    \
   1843                   DeviceMemory<std::complex<float>> *c, int ldc) override;     \
   1844   bool DoBlasGemm(Stream *stream, blas::Transpose transa,                      \
   1845                   blas::Transpose transb, uint64 m, uint64 n, uint64 k,        \
   1846                   std::complex<double> alpha,                                  \
   1847                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   1848                   const DeviceMemory<std::complex<double>> &b, int ldb,        \
   1849                   std::complex<double> beta,                                   \
   1850                   DeviceMemory<std::complex<double>> *c, int ldc) override;    \
   1851   bool DoBlasGemmWithProfiling(                                                \
   1852       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1853       uint64 m, uint64 n, uint64 k, float alpha,                               \
   1854       const DeviceMemory<Eigen::half> &a, int lda,                             \
   1855       const DeviceMemory<Eigen::half> &b, int ldb, float beta,                 \
   1856       DeviceMemory<Eigen::half> *c, int ldc,                                   \
   1857       blas::ProfileResult *output_profile_result) override;                    \
   1858   bool DoBlasGemmWithProfiling(                                                \
   1859       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1860       uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, \
   1861       int lda, const DeviceMemory<float> &b, int ldb, float beta,              \
   1862       DeviceMemory<float> *c, int ldc,                                         \
   1863       blas::ProfileResult *output_profile_result) override;                    \
   1864   bool DoBlasGemmWithProfiling(                                                \
   1865       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1866       uint64 m, uint64 n, uint64 k, double alpha,                              \
   1867       const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,   \
   1868       int ldb, double beta, DeviceMemory<double> *c, int ldc,                  \
   1869       blas::ProfileResult *output_profile_result) override;                    \
   1870   bool DoBlasGemmWithProfiling(                                                \
   1871       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1872       uint64 m, uint64 n, uint64 k, std::complex<float> alpha,                 \
   1873       const DeviceMemory<std::complex<float>> &a, int lda,                     \
   1874       const DeviceMemory<std::complex<float>> &b, int ldb,                     \
   1875       std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, \
   1876       blas::ProfileResult *output_profile_result) override;                    \
   1877   bool DoBlasGemmWithProfiling(                                                \
   1878       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1879       uint64 m, uint64 n, uint64 k, std::complex<double> alpha,                \
   1880       const DeviceMemory<std::complex<double>> &a, int lda,                    \
   1881       const DeviceMemory<std::complex<double>> &b, int ldb,                    \
   1882       std::complex<double> beta, DeviceMemory<std::complex<double>> *c,        \
   1883       int ldc, blas::ProfileResult *output_profile_result) override;           \
   1884   bool GetBlasGemmAlgorithms(std::vector<blas::AlgorithmType> *out_algorithms) \
   1885       override;                                                                \
   1886   bool DoBlasGemmWithAlgorithm(                                                \
   1887       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1888       uint64 m, uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a,    \
   1889       int lda, const DeviceMemory<int8> &b, int ldb, int beta,                 \
   1890       DeviceMemory<int> *c, int ldc, blas::ComputationType computation_type,   \
   1891       blas::AlgorithmType algorithm,                                           \
   1892       blas::ProfileResult *output_profile_result) override;                    \
   1893   bool DoBlasGemmWithAlgorithm(                                                \
   1894       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1895       uint64 m, uint64 n, uint64 k, const Eigen::half &alpha,                  \
   1896       const DeviceMemory<Eigen::half> &a, int lda,                             \
   1897       const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta,    \
   1898       DeviceMemory<Eigen::half> *c, int ldc,                                   \
   1899       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
   1900       blas::ProfileResult *output_profile_result) override;                    \
   1901   bool DoBlasGemmWithAlgorithm(                                                \
   1902       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1903       uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, \
   1904       int lda, const DeviceMemory<float> &b, int ldb, float beta,              \
   1905       DeviceMemory<float> *c, int ldc, blas::ComputationType computation_type, \
   1906       blas::AlgorithmType algorithm,                                           \
   1907       blas::ProfileResult *output_profile_result) override;                    \
   1908   bool DoBlasGemmWithAlgorithm(                                                \
   1909       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1910       uint64 m, uint64 n, uint64 k, double alpha,                              \
   1911       const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,   \
   1912       int ldb, double beta, DeviceMemory<double> *c, int ldc,                  \
   1913       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
   1914       blas::ProfileResult *output_profile_result) override;                    \
   1915   bool DoBlasGemmWithAlgorithm(                                                \
   1916       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1917       uint64 m, uint64 n, uint64 k, std::complex<float> alpha,                 \
   1918       const DeviceMemory<std::complex<float>> &a, int lda,                     \
   1919       const DeviceMemory<std::complex<float>> &b, int ldb,                     \
   1920       std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, \
   1921       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
   1922       blas::ProfileResult *output_profile_result) override;                    \
   1923   bool DoBlasGemmWithAlgorithm(                                                \
   1924       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1925       uint64 m, uint64 n, uint64 k, std::complex<double> alpha,                \
   1926       const DeviceMemory<std::complex<double>> &a, int lda,                    \
   1927       const DeviceMemory<std::complex<double>> &b, int ldb,                    \
   1928       std::complex<double> beta, DeviceMemory<std::complex<double>> *c,        \
   1929       int ldc, blas::ComputationType computation_type,                         \
   1930       blas::AlgorithmType algorithm,                                           \
   1931       blas::ProfileResult *output_profile_result) override;                    \
   1932   bool DoBlasGemmBatched(                                                      \
   1933       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1934       uint64 m, uint64 n, uint64 k, float alpha,                               \
   1935       const port::ArraySlice<DeviceMemory<float> *> &a, int lda,               \
   1936       const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta,   \
   1937       const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,               \
   1938       int batch_count, ScratchAllocator *scratch_allocator) override;          \
   1939   bool DoBlasGemmBatched(                                                      \
   1940       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1941       uint64 m, uint64 n, uint64 k, double alpha,                              \
   1942       const port::ArraySlice<DeviceMemory<double> *> &a, int lda,              \
   1943       const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, \
   1944       const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,              \
   1945       int batch_count, ScratchAllocator *scratch_allocator) override;          \
   1946   bool DoBlasGemmBatched(                                                      \
   1947       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1948       uint64 m, uint64 n, uint64 k, std::complex<float> alpha,                 \
   1949       const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda, \
   1950       const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, \
   1951       std::complex<float> beta,                                                \
   1952       const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, \
   1953       int batch_count, ScratchAllocator *scratch_allocator) override;          \
   1954   bool DoBlasGemmBatched(                                                      \
   1955       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
   1956       uint64 m, uint64 n, uint64 k, std::complex<double> alpha,                \
   1957       const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a,         \
   1958       int lda,                                                                 \
   1959       const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b,         \
   1960       int ldb, std::complex<double> beta,                                      \
   1961       const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c,         \
   1962       int ldc, int batch_count, ScratchAllocator *scratch_allocator) override; \
   1963   bool DoBlasHemm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   1964                   uint64 m, uint64 n, std::complex<float> alpha,               \
   1965                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   1966                   const DeviceMemory<std::complex<float>> &b, int ldb,         \
   1967                   std::complex<float> beta,                                    \
   1968                   DeviceMemory<std::complex<float>> *c, int ldc) override;     \
   1969   bool DoBlasHemm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   1970                   uint64 m, uint64 n, std::complex<double> alpha,              \
   1971                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   1972                   const DeviceMemory<std::complex<double>> &b, int ldb,        \
   1973                   std::complex<double> beta,                                   \
   1974                   DeviceMemory<std::complex<double>> *c, int ldc) override;    \
   1975   bool DoBlasHerk(Stream *stream, blas::UpperLower uplo,                       \
   1976                   blas::Transpose trans, uint64 n, uint64 k, float alpha,      \
   1977                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   1978                   float beta, DeviceMemory<std::complex<float>> *c, int ldc)   \
   1979       override;                                                                \
   1980   bool DoBlasHerk(Stream *stream, blas::UpperLower uplo,                       \
   1981                   blas::Transpose trans, uint64 n, uint64 k, double alpha,     \
   1982                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   1983                   double beta, DeviceMemory<std::complex<double>> *c, int ldc) \
   1984       override;                                                                \
   1985   bool DoBlasHer2k(                                                            \
   1986       Stream *stream, blas::UpperLower uplo, blas::Transpose trans, uint64 n,  \
   1987       uint64 k, std::complex<float> alpha,                                     \
   1988       const DeviceMemory<std::complex<float>> &a, int lda,                     \
   1989       const DeviceMemory<std::complex<float>> &b, int ldb, float beta,         \
   1990       DeviceMemory<std::complex<float>> *c, int ldc) override;                 \
   1991   bool DoBlasHer2k(                                                            \
   1992       Stream *stream, blas::UpperLower uplo, blas::Transpose trans, uint64 n,  \
   1993       uint64 k, std::complex<double> alpha,                                    \
   1994       const DeviceMemory<std::complex<double>> &a, int lda,                    \
   1995       const DeviceMemory<std::complex<double>> &b, int ldb, double beta,       \
   1996       DeviceMemory<std::complex<double>> *c, int ldc) override;                \
   1997   bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   1998                   uint64 m, uint64 n, float alpha,                             \
   1999                   const DeviceMemory<float> &a, int lda,                       \
   2000                   const DeviceMemory<float> &b, int ldb, float beta,           \
   2001                   DeviceMemory<float> *c, int ldc) override;                   \
   2002   bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2003                   uint64 m, uint64 n, double alpha,                            \
   2004                   const DeviceMemory<double> &a, int lda,                      \
   2005                   const DeviceMemory<double> &b, int ldb, double beta,         \
   2006                   DeviceMemory<double> *c, int ldc) override;                  \
   2007   bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2008                   uint64 m, uint64 n, std::complex<float> alpha,               \
   2009                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   2010                   const DeviceMemory<std::complex<float>> &b, int ldb,         \
   2011                   std::complex<float> beta,                                    \
   2012                   DeviceMemory<std::complex<float>> *c, int ldc) override;     \
   2013   bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2014                   uint64 m, uint64 n, std::complex<double> alpha,              \
   2015                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   2016                   const DeviceMemory<std::complex<double>> &b, int ldb,        \
   2017                   std::complex<double> beta,                                   \
   2018                   DeviceMemory<std::complex<double>> *c, int ldc) override;    \
   2019   bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo,                       \
   2020                   blas::Transpose trans, uint64 n, uint64 k, float alpha,      \
   2021                   const DeviceMemory<float> &a, int lda, float beta,           \
   2022                   DeviceMemory<float> *c, int ldc) override;                   \
   2023   bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo,                       \
   2024                   blas::Transpose trans, uint64 n, uint64 k, double alpha,     \
   2025                   const DeviceMemory<double> &a, int lda, double beta,         \
   2026                   DeviceMemory<double> *c, int ldc) override;                  \
   2027   bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo,                       \
   2028                   blas::Transpose trans, uint64 n, uint64 k,                   \
   2029                   std::complex<float> alpha,                                   \
   2030                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   2031                   std::complex<float> beta,                                    \
   2032                   DeviceMemory<std::complex<float>> *c, int ldc) override;     \
   2033   bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo,                       \
   2034                   blas::Transpose trans, uint64 n, uint64 k,                   \
   2035                   std::complex<double> alpha,                                  \
   2036                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   2037                   std::complex<double> beta,                                   \
   2038                   DeviceMemory<std::complex<double>> *c, int ldc) override;    \
   2039   bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,                      \
   2040                    blas::Transpose trans, uint64 n, uint64 k, float alpha,     \
   2041                    const DeviceMemory<float> &a, int lda,                      \
   2042                    const DeviceMemory<float> &b, int ldb, float beta,          \
   2043                    DeviceMemory<float> *c, int ldc) override;                  \
   2044   bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,                      \
   2045                    blas::Transpose trans, uint64 n, uint64 k, double alpha,    \
   2046                    const DeviceMemory<double> &a, int lda,                     \
   2047                    const DeviceMemory<double> &b, int ldb, double beta,        \
   2048                    DeviceMemory<double> *c, int ldc) override;                 \
   2049   bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,                      \
   2050                    blas::Transpose trans, uint64 n, uint64 k,                  \
   2051                    std::complex<float> alpha,                                  \
   2052                    const DeviceMemory<std::complex<float>> &a, int lda,        \
   2053                    const DeviceMemory<std::complex<float>> &b, int ldb,        \
   2054                    std::complex<float> beta,                                   \
   2055                    DeviceMemory<std::complex<float>> *c, int ldc) override;    \
   2056   bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,                      \
   2057                    blas::Transpose trans, uint64 n, uint64 k,                  \
   2058                    std::complex<double> alpha,                                 \
   2059                    const DeviceMemory<std::complex<double>> &a, int lda,       \
   2060                    const DeviceMemory<std::complex<double>> &b, int ldb,       \
   2061                    std::complex<double> beta,                                  \
   2062                    DeviceMemory<std::complex<double>> *c, int ldc) override;   \
   2063   bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2064                   blas::Transpose transa, blas::Diagonal diag, uint64 m,       \
   2065                   uint64 n, float alpha, const DeviceMemory<float> &a,         \
   2066                   int lda, DeviceMemory<float> *b, int ldb) override;          \
   2067   bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2068                   blas::Transpose transa, blas::Diagonal diag, uint64 m,       \
   2069                   uint64 n, double alpha, const DeviceMemory<double> &a,       \
   2070                   int lda, DeviceMemory<double> *b, int ldb) override;         \
   2071   bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2072                   blas::Transpose transa, blas::Diagonal diag, uint64 m,       \
   2073                   uint64 n, std::complex<float> alpha,                         \
   2074                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   2075                   DeviceMemory<std::complex<float>> *b, int ldb) override;     \
   2076   bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2077                   blas::Transpose transa, blas::Diagonal diag, uint64 m,       \
   2078                   uint64 n, std::complex<double> alpha,                        \
   2079                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   2080                   DeviceMemory<std::complex<double>> *b, int ldb) override;    \
   2081   bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2082                   blas::Transpose transa, blas::Diagonal diag, uint64 m,       \
   2083                   uint64 n, float alpha, const DeviceMemory<float> &a,         \
   2084                   int lda, DeviceMemory<float> *b, int ldb) override;          \
   2085   bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2086                   blas::Transpose transa, blas::Diagonal diag, uint64 m,       \
   2087                   uint64 n, double alpha, const DeviceMemory<double> &a,       \
   2088                   int lda, DeviceMemory<double> *b, int ldb) override;         \
   2089   bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2090                   blas::Transpose transa, blas::Diagonal diag, uint64 m,       \
   2091                   uint64 n, std::complex<float> alpha,                         \
   2092                   const DeviceMemory<std::complex<float>> &a, int lda,         \
   2093                   DeviceMemory<std::complex<float>> *b, int ldb) override;     \
   2094   bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo,      \
   2095                   blas::Transpose transa, blas::Diagonal diag, uint64 m,       \
   2096                   uint64 n, std::complex<double> alpha,                        \
   2097                   const DeviceMemory<std::complex<double>> &a, int lda,        \
   2098                   DeviceMemory<std::complex<double>> *b, int ldb) override;
   2099 
   2100 }  // namespace blas
   2101 }  // namespace gputools
   2102 }  // namespace perftools
   2103 
   2104 #endif  // TENSORFLOW_STREAM_EXECUTOR_BLAS_H_
   2105