1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // Exposes the family of BLAS routines as pre-canned high performance calls for 17 // use in conjunction with the StreamExecutor abstraction. 18 // 19 // Note that this interface is optionally supported by platforms; see 20 // StreamExecutor::SupportsBlas() for details. 21 // 22 // This abstraction makes it simple to entrain BLAS operations on GPU data into 23 // a Stream -- users typically will not use this API directly, but will use the 24 // Stream builder methods to entrain these operations "under the hood". For 25 // example: 26 // 27 // DeviceMemory<float> x = stream_exec->AllocateArray<float>(1024); 28 // DeviceMemory<float> y = stream_exec->AllocateArray<float>(1024); 29 // // ... populate x and y ... 30 // Stream stream{stream_exec}; 31 // stream 32 // .Init() 33 // .ThenBlasAxpy(1024, 5.5, x, 1, &y, 1); 34 // SE_CHECK_OK(stream.BlockHostUntilDone()); 35 // 36 // By using stream operations in this manner the user can easily intermix custom 37 // kernel launches (via StreamExecutor::ThenLaunch()) with these pre-canned BLAS 38 // routines. 39 40 #ifndef TENSORFLOW_STREAM_EXECUTOR_BLAS_H_ 41 #define TENSORFLOW_STREAM_EXECUTOR_BLAS_H_ 42 43 #include <complex> 44 #include "tensorflow/stream_executor/platform/port.h" 45 46 #include "tensorflow/stream_executor/lib/array_slice.h" 47 48 namespace Eigen { 49 struct half; 50 } // namespace Eigen 51 52 namespace perftools { 53 namespace gputools { 54 55 class Stream; 56 class ScratchAllocator; 57 58 template <typename ElemT> 59 class DeviceMemory; 60 61 namespace blas { 62 63 // Specifies whether the input matrix will be transposed or 64 // transposed+conjugated before any BLAS operations. 65 enum class Transpose { kNoTranspose, kTranspose, kConjugateTranspose }; 66 67 // Returns a name for t. 68 string TransposeString(Transpose t); 69 70 // Specifies whether the upper or lower triangular part of a 71 // symmetric/Hermitian matrix is used. 72 enum class UpperLower { kUpper, kLower }; 73 74 // Returns a name for ul. 75 string UpperLowerString(UpperLower ul); 76 77 // Specifies whether a matrix is unit triangular. 78 enum class Diagonal { kUnit, kNonUnit }; 79 80 // Returns a name for d. 81 string DiagonalString(Diagonal d); 82 83 // Specifies whether a Hermitian matrix appears on the left or right in 84 // operation. 85 enum class Side { kLeft, kRight }; 86 87 // Returns a name for s. 88 string SideString(Side s); 89 90 // Type with which intermediate computations of a blas routine are performed. 91 // 92 // Some blas calls can perform computations with a type that's different than 93 // the type of their inputs/outputs. This lets you e.g. multiply two matricies 94 // of int8s using float32s to store the matmul's intermediate values. 95 enum class ComputationType { 96 kF16, // 16-bit floating-point 97 kF32, // 32-bit floating-point 98 kF64, // 64-bit floating-point 99 kI32, // 32-bit integer 100 kComplexF32, // Complex number comprised of two f32s. 101 kComplexF64, // Complex number comprised of two f64s. 102 }; 103 104 // Converts a ComputationType to a string. 105 string ComputationTypeString(ComputationType ty); 106 107 // Opaque identifier for an "algorithm" used by a blas routine. This functions 108 // as a hint to the blas library. 109 typedef int64 AlgorithmType; 110 constexpr AlgorithmType kDefaultAlgorithm = -1; 111 constexpr AlgorithmType kDefaultBlasGemm = -2; 112 constexpr AlgorithmType kDefaultBlasGemv = -3; 113 constexpr AlgorithmType kNoAlgorithm = -4; 114 115 // blas uses -1 to represent the default algorithm. This happens to match up 116 // with the CUBLAS_GEMM_DFALT constant, so cuda_blas.cc is using static_cast 117 // to convert from AlgorithmType to cublasGemmAlgo_t, and uses a static_assert 118 // to ensure that this assumption does not break. 119 // If another blas implementation uses a different value for the default 120 // algorithm, then it needs to convert kDefaultGemmAlgo to that value 121 // (e.g. via a function called ToWhateverGemmAlgo). 122 constexpr AlgorithmType kDefaultGemmAlgo = -1; 123 124 // Describes the result of a performance experiment, usually timing the speed of 125 // a particular AlgorithmType. 126 // 127 // If the call we were benchmarking failed (a common occurrence; not all 128 // algorithms are valid for all calls), is_valid() will be false. 129 class ProfileResult { 130 public: 131 bool is_valid() const { return is_valid_; } 132 void set_is_valid(bool val) { is_valid_ = val; } 133 AlgorithmType algorithm() const { return algorithm_; } 134 void set_algorithm(AlgorithmType val) { algorithm_ = val; } 135 float elapsed_time_in_ms() const { return elapsed_time_in_ms_; } 136 void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; } 137 138 private: 139 bool is_valid_ = false; 140 AlgorithmType algorithm_ = kDefaultAlgorithm; 141 float elapsed_time_in_ms_ = std::numeric_limits<float>::max(); 142 }; 143 144 class AlgorithmConfig { 145 public: 146 AlgorithmConfig() : algorithm_(kDefaultAlgorithm) {} 147 explicit AlgorithmConfig(AlgorithmType algorithm) : algorithm_(algorithm) {} 148 AlgorithmType algorithm() const { return algorithm_; } 149 void set_algorithm(AlgorithmType val) { algorithm_ = val; } 150 bool operator==(const AlgorithmConfig &other) const { 151 return this->algorithm_ == other.algorithm_; 152 } 153 bool operator!=(const AlgorithmConfig &other) const { 154 return !(*this == other); 155 } 156 string ToString() const; 157 158 private: 159 AlgorithmType algorithm_; 160 }; 161 162 // BLAS support interface -- this can be derived from a GPU executor when the 163 // underlying platform has an BLAS library implementation available. See 164 // StreamExecutor::AsBlas(). 165 // 166 // Thread-hostile: CUDA associates a CUDA-context with a particular thread in 167 // the system. Any operation that a user attempts to perform by enqueueing BLAS 168 // operations on a thread not-associated with the CUDA-context has unknown 169 // behavior at the current time; see b/13176597 170 class BlasSupport { 171 public: 172 virtual ~BlasSupport() {} 173 174 // Computes the sum of magnitudes of the vector elements. 175 // result <- |Re x(1)| + |Im x(1)| + |Re x(2)| + |Im x(2)|+ ... + |Re x(n)| 176 // + |Im x(n)|. 177 // Note that Im x(i) = 0 for real types float/double. 178 virtual bool DoBlasAsum(Stream *stream, uint64 elem_count, 179 const DeviceMemory<float> &x, int incx, 180 DeviceMemory<float> *result) = 0; 181 virtual bool DoBlasAsum(Stream *stream, uint64 elem_count, 182 const DeviceMemory<double> &x, int incx, 183 DeviceMemory<double> *result) = 0; 184 virtual bool DoBlasAsum(Stream *stream, uint64 elem_count, 185 const DeviceMemory<std::complex<float>> &x, int incx, 186 DeviceMemory<float> *result) = 0; 187 virtual bool DoBlasAsum(Stream *stream, uint64 elem_count, 188 const DeviceMemory<std::complex<double>> &x, int incx, 189 DeviceMemory<double> *result) = 0; 190 191 // Performs a BLAS y <- ax+y operation. 192 virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha, 193 const DeviceMemory<float> &x, int incx, 194 DeviceMemory<float> *y, int incy) = 0; 195 virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha, 196 const DeviceMemory<double> &x, int incx, 197 DeviceMemory<double> *y, int incy) = 0; 198 virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, 199 std::complex<float> alpha, 200 const DeviceMemory<std::complex<float>> &x, int incx, 201 DeviceMemory<std::complex<float>> *y, int incy) = 0; 202 virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, 203 std::complex<double> alpha, 204 const DeviceMemory<std::complex<double>> &x, int incx, 205 DeviceMemory<std::complex<double>> *y, int incy) = 0; 206 207 // Copies vector to another vector: y <- x. 208 virtual bool DoBlasCopy(Stream *stream, uint64 elem_count, 209 const DeviceMemory<float> &x, int incx, 210 DeviceMemory<float> *y, int incy) = 0; 211 virtual bool DoBlasCopy(Stream *stream, uint64 elem_count, 212 const DeviceMemory<double> &x, int incx, 213 DeviceMemory<double> *y, int incy) = 0; 214 virtual bool DoBlasCopy(Stream *stream, uint64 elem_count, 215 const DeviceMemory<std::complex<float>> &x, int incx, 216 DeviceMemory<std::complex<float>> *y, int incy) = 0; 217 virtual bool DoBlasCopy(Stream *stream, uint64 elem_count, 218 const DeviceMemory<std::complex<double>> &x, int incx, 219 DeviceMemory<std::complex<double>> *y, int incy) = 0; 220 221 // Performs a BLAS dot product result <- x . y. 222 virtual bool DoBlasDot(Stream *stream, uint64 elem_count, 223 const DeviceMemory<float> &x, int incx, 224 const DeviceMemory<float> &y, int incy, 225 DeviceMemory<float> *result) = 0; 226 virtual bool DoBlasDot(Stream *stream, uint64 elem_count, 227 const DeviceMemory<double> &x, int incx, 228 const DeviceMemory<double> &y, int incy, 229 DeviceMemory<double> *result) = 0; 230 231 // Performs a BLAS dot product result <- conj(x) . y for complex types. 232 virtual bool DoBlasDotc(Stream *stream, uint64 elem_count, 233 const DeviceMemory<std::complex<float>> &x, int incx, 234 const DeviceMemory<std::complex<float>> &y, int incy, 235 DeviceMemory<std::complex<float>> *result) = 0; 236 virtual bool DoBlasDotc(Stream *stream, uint64 elem_count, 237 const DeviceMemory<std::complex<double>> &x, int incx, 238 const DeviceMemory<std::complex<double>> &y, int incy, 239 DeviceMemory<std::complex<double>> *result) = 0; 240 241 // Performs a BLAS dot product result <- x . y for complex types. Note that 242 // x is unconjugated in this routine. 243 virtual bool DoBlasDotu(Stream *stream, uint64 elem_count, 244 const DeviceMemory<std::complex<float>> &x, int incx, 245 const DeviceMemory<std::complex<float>> &y, int incy, 246 DeviceMemory<std::complex<float>> *result) = 0; 247 virtual bool DoBlasDotu(Stream *stream, uint64 elem_count, 248 const DeviceMemory<std::complex<double>> &x, int incx, 249 const DeviceMemory<std::complex<double>> &y, int incy, 250 DeviceMemory<std::complex<double>> *result) = 0; 251 252 // Computes the Euclidean norm of a vector: result <- ||x||. 253 // See the following link for more information of Euclidean norm: 254 // http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm 255 virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count, 256 const DeviceMemory<float> &x, int incx, 257 DeviceMemory<float> *result) = 0; 258 virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count, 259 const DeviceMemory<double> &x, int incx, 260 DeviceMemory<double> *result) = 0; 261 virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count, 262 const DeviceMemory<std::complex<float>> &x, int incx, 263 DeviceMemory<float> *result) = 0; 264 virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count, 265 const DeviceMemory<std::complex<double>> &x, int incx, 266 DeviceMemory<double> *result) = 0; 267 268 // Performs rotation of points in the plane: 269 // x(i) = c*x(i) + s*y(i) 270 // y(i) = c*y(i) - s*x(i). 271 virtual bool DoBlasRot(Stream *stream, uint64 elem_count, 272 DeviceMemory<float> *x, int incx, 273 DeviceMemory<float> *y, int incy, float c, 274 float s) = 0; 275 virtual bool DoBlasRot(Stream *stream, uint64 elem_count, 276 DeviceMemory<double> *x, int incx, 277 DeviceMemory<double> *y, int incy, double c, 278 double s) = 0; 279 virtual bool DoBlasRot(Stream *stream, uint64 elem_count, 280 DeviceMemory<std::complex<float>> *x, int incx, 281 DeviceMemory<std::complex<float>> *y, int incy, 282 float c, float s) = 0; 283 virtual bool DoBlasRot(Stream *stream, uint64 elem_count, 284 DeviceMemory<std::complex<double>> *x, int incx, 285 DeviceMemory<std::complex<double>> *y, int incy, 286 double c, double s) = 0; 287 288 // Computes the parameters for a Givens rotation. 289 // Given the Cartesian coordinates (a, b) of a point, these routines return 290 // the parameters c, s, r, and z associated with the Givens rotation. The 291 // parameters c and s define a unitary matrix such that: 292 // 293 // | c s |.| a | = | r | 294 // | -s c | | b | | 0 | 295 // 296 // The parameter z is defined such that if |a| > |b|, z is s; otherwise if 297 // c is not 0 z is 1/c; otherwise z is 1. 298 virtual bool DoBlasRotg(Stream *stream, DeviceMemory<float> *a, 299 DeviceMemory<float> *b, DeviceMemory<float> *c, 300 DeviceMemory<float> *s) = 0; 301 virtual bool DoBlasRotg(Stream *stream, DeviceMemory<double> *a, 302 DeviceMemory<double> *b, DeviceMemory<double> *c, 303 DeviceMemory<double> *s) = 0; 304 virtual bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a, 305 DeviceMemory<std::complex<float>> *b, 306 DeviceMemory<float> *c, 307 DeviceMemory<std::complex<float>> *s) = 0; 308 virtual bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a, 309 DeviceMemory<std::complex<double>> *b, 310 DeviceMemory<double> *c, 311 DeviceMemory<std::complex<double>> *s) = 0; 312 313 // Performs modified Givens rotation of points in the plane. 314 // Given two vectors x and y, each vector element of these vectors is replaced 315 // as follows: 316 // 317 // | x(i) | = H | x(i) | 318 // | y(i) | | y(i) | 319 // 320 // for i=1 to n, where H is a modified Givens transformation matrix whose 321 // values are stored in the param[1] through param[4] array. 322 // For more information please Google this routine. 323 virtual bool DoBlasRotm(Stream *stream, uint64 elem_count, 324 DeviceMemory<float> *x, int incx, 325 DeviceMemory<float> *y, int incy, 326 const DeviceMemory<float> ¶m) = 0; 327 virtual bool DoBlasRotm(Stream *stream, uint64 elem_count, 328 DeviceMemory<double> *x, int incx, 329 DeviceMemory<double> *y, int incy, 330 const DeviceMemory<double> ¶m) = 0; 331 332 // Computes the parameters for a modified Givens rotation. 333 // Given Cartesian coordinates (x1, y1) of an input vector, these routines 334 // compute the components of a modified Givens transformation matrix H that 335 // zeros the y-component of the resulting vector: 336 // 337 // | x1 | = H | x1 * sqrt(d1) | 338 // | 0 | | y1 * sqrt(d1) | 339 // 340 // For more information please Google this routine. 341 virtual bool DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1, 342 DeviceMemory<float> *d2, DeviceMemory<float> *x1, 343 const DeviceMemory<float> &y1, 344 DeviceMemory<float> *param) = 0; 345 virtual bool DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1, 346 DeviceMemory<double> *d2, DeviceMemory<double> *x1, 347 const DeviceMemory<double> &y1, 348 DeviceMemory<double> *param) = 0; 349 350 // Computes the product of a vector by a scalar: x <- a*x. 351 virtual bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha, 352 DeviceMemory<float> *x, int incx) = 0; 353 virtual bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha, 354 DeviceMemory<double> *x, int incx) = 0; 355 virtual bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha, 356 DeviceMemory<std::complex<float>> *x, int incx) = 0; 357 virtual bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha, 358 DeviceMemory<std::complex<double>> *x, int incx) = 0; 359 virtual bool DoBlasScal(Stream *stream, uint64 elem_count, 360 std::complex<float> alpha, 361 DeviceMemory<std::complex<float>> *x, int incx) = 0; 362 virtual bool DoBlasScal(Stream *stream, uint64 elem_count, 363 std::complex<double> alpha, 364 DeviceMemory<std::complex<double>> *x, int incx) = 0; 365 366 // Swaps a vector with another vector. 367 virtual bool DoBlasSwap(Stream *stream, uint64 elem_count, 368 DeviceMemory<float> *x, int incx, 369 DeviceMemory<float> *y, int incy) = 0; 370 virtual bool DoBlasSwap(Stream *stream, uint64 elem_count, 371 DeviceMemory<double> *x, int incx, 372 DeviceMemory<double> *y, int incy) = 0; 373 virtual bool DoBlasSwap(Stream *stream, uint64 elem_count, 374 DeviceMemory<std::complex<float>> *x, int incx, 375 DeviceMemory<std::complex<float>> *y, int incy) = 0; 376 virtual bool DoBlasSwap(Stream *stream, uint64 elem_count, 377 DeviceMemory<std::complex<double>> *x, int incx, 378 DeviceMemory<std::complex<double>> *y, int incy) = 0; 379 380 // Finds the index of the element with maximum absolute value. 381 virtual bool DoBlasIamax(Stream *stream, uint64 elem_count, 382 const DeviceMemory<float> &x, int incx, 383 DeviceMemory<int> *result) = 0; 384 virtual bool DoBlasIamax(Stream *stream, uint64 elem_count, 385 const DeviceMemory<double> &x, int incx, 386 DeviceMemory<int> *result) = 0; 387 virtual bool DoBlasIamax(Stream *stream, uint64 elem_count, 388 const DeviceMemory<std::complex<float>> &x, int incx, 389 DeviceMemory<int> *result) = 0; 390 virtual bool DoBlasIamax(Stream *stream, uint64 elem_count, 391 const DeviceMemory<std::complex<double>> &x, 392 int incx, DeviceMemory<int> *result) = 0; 393 394 // Finds the index of the element with minimum absolute value. 395 virtual bool DoBlasIamin(Stream *stream, uint64 elem_count, 396 const DeviceMemory<float> &x, int incx, 397 DeviceMemory<int> *result) = 0; 398 virtual bool DoBlasIamin(Stream *stream, uint64 elem_count, 399 const DeviceMemory<double> &x, int incx, 400 DeviceMemory<int> *result) = 0; 401 virtual bool DoBlasIamin(Stream *stream, uint64 elem_count, 402 const DeviceMemory<std::complex<float>> &x, int incx, 403 DeviceMemory<int> *result) = 0; 404 virtual bool DoBlasIamin(Stream *stream, uint64 elem_count, 405 const DeviceMemory<std::complex<double>> &x, 406 int incx, DeviceMemory<int> *result) = 0; 407 408 // Computes a matrix-vector product using a general band matrix: 409 // 410 // y <- alpha * a * x + beta * y, 411 // or 412 // y <- alpha * a' * x + beta * y, 413 // or 414 // y <- alpha * conj(a') * x + beta * y, 415 // 416 // alpha and beta are scalars; a is an m-by-n general band matrix, with kl 417 // sub-diagonals and ku super-diagonals; x is a vector with 418 // n(trans==kNoTranspose)/m(otherwise) elements; 419 // y is a vector with m(trans==kNoTranspose)/n(otherwise) elements. 420 virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, 421 uint64 n, uint64 kl, uint64 ku, float alpha, 422 const DeviceMemory<float> &a, int lda, 423 const DeviceMemory<float> &x, int incx, float beta, 424 DeviceMemory<float> *y, int incy) = 0; 425 virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, 426 uint64 n, uint64 kl, uint64 ku, double alpha, 427 const DeviceMemory<double> &a, int lda, 428 const DeviceMemory<double> &x, int incx, double beta, 429 DeviceMemory<double> *y, int incy) = 0; 430 virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, 431 uint64 n, uint64 kl, uint64 ku, 432 std::complex<float> alpha, 433 const DeviceMemory<std::complex<float>> &a, int lda, 434 const DeviceMemory<std::complex<float>> &x, int incx, 435 std::complex<float> beta, 436 DeviceMemory<std::complex<float>> *y, int incy) = 0; 437 virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, 438 uint64 n, uint64 kl, uint64 ku, 439 std::complex<double> alpha, 440 const DeviceMemory<std::complex<double>> &a, int lda, 441 const DeviceMemory<std::complex<double>> &x, int incx, 442 std::complex<double> beta, 443 DeviceMemory<std::complex<double>> *y, int incy) = 0; 444 445 // Computes a matrix-vector product using a general matrix. 446 // 447 // y <- alpha * a * x + beta * y, 448 // or 449 // y <- alpha * a' * x + beta * y, 450 // or 451 // y <- alpha * conj(a') * x + beta * y, 452 // 453 // alpha and beta are scalars; a is an m-by-n general matrix; x is a vector 454 // with n(trans==kNoTranspose)/m(otherwise) elements; 455 // y is a vector with m(trans==kNoTranspose)/n(otherwise) elements. 456 virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, 457 uint64 n, float alpha, const DeviceMemory<float> &a, 458 int lda, const DeviceMemory<float> &x, int incx, 459 float beta, DeviceMemory<float> *y, int incy) = 0; 460 virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, 461 uint64 n, double alpha, const DeviceMemory<double> &a, 462 int lda, const DeviceMemory<double> &x, int incx, 463 double beta, DeviceMemory<double> *y, int incy) = 0; 464 virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, 465 uint64 n, std::complex<float> alpha, 466 const DeviceMemory<std::complex<float>> &a, int lda, 467 const DeviceMemory<std::complex<float>> &x, int incx, 468 std::complex<float> beta, 469 DeviceMemory<std::complex<float>> *y, int incy) = 0; 470 virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, 471 uint64 n, std::complex<double> alpha, 472 const DeviceMemory<std::complex<double>> &a, int lda, 473 const DeviceMemory<std::complex<double>> &x, int incx, 474 std::complex<double> beta, 475 DeviceMemory<std::complex<double>> *y, int incy) = 0; 476 477 virtual bool DoBlasGemvWithProfiling( 478 Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha, 479 const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x, 480 int incx, float beta, DeviceMemory<float> *y, int incy, 481 ProfileResult *output_profile_result) = 0; 482 virtual bool DoBlasGemvWithProfiling( 483 Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha, 484 const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x, 485 int incx, double beta, DeviceMemory<double> *y, int incy, 486 ProfileResult *output_profile_result) = 0; 487 virtual bool DoBlasGemvWithProfiling( 488 Stream *stream, blas::Transpose trans, uint64 m, uint64 n, 489 std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a, 490 int lda, const DeviceMemory<std::complex<float>> &x, int incx, 491 std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy, 492 ProfileResult *output_profile_result) = 0; 493 virtual bool DoBlasGemvWithProfiling( 494 Stream *stream, blas::Transpose trans, uint64 m, uint64 n, 495 std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a, 496 int lda, const DeviceMemory<std::complex<double>> &x, int incx, 497 std::complex<double> beta, DeviceMemory<std::complex<double>> *y, 498 int incy, ProfileResult *output_profile_result) = 0; 499 500 // Performs a rank-1 update of a general matrix. 501 // 502 // a <- alpha * x * y' + a, 503 // 504 // alpha is a scalar; x is an m-element vector; y is an n-element vector; a is 505 // an m-by-n general matrix. 506 virtual bool DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha, 507 const DeviceMemory<float> &x, int incx, 508 const DeviceMemory<float> &y, int incy, 509 DeviceMemory<float> *a, int lda) = 0; 510 virtual bool DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha, 511 const DeviceMemory<double> &x, int incx, 512 const DeviceMemory<double> &y, int incy, 513 DeviceMemory<double> *a, int lda) = 0; 514 515 // Performs a rank-1 update (conjugated) of a general matrix. 516 // 517 // a <- alpha * x * conj(y') + a, 518 // 519 // alpha is a scalar; x is an m-element vector; y is an n-element vector; a is 520 // an m-by-n general matrix. 521 virtual bool DoBlasGerc(Stream *stream, uint64 m, uint64 n, 522 std::complex<float> alpha, 523 const DeviceMemory<std::complex<float>> &x, int incx, 524 const DeviceMemory<std::complex<float>> &y, int incy, 525 DeviceMemory<std::complex<float>> *a, int lda) = 0; 526 virtual bool DoBlasGerc(Stream *stream, uint64 m, uint64 n, 527 std::complex<double> alpha, 528 const DeviceMemory<std::complex<double>> &x, int incx, 529 const DeviceMemory<std::complex<double>> &y, int incy, 530 DeviceMemory<std::complex<double>> *a, int lda) = 0; 531 532 // Performs a rank-1 update (unconjugated) of a general matrix. 533 // 534 // a <- alpha * x * y' + a, 535 // 536 // alpha is a scalar; x is an m-element vector; y is an n-element vector; a is 537 // an m-by-n general matrix. 538 virtual bool DoBlasGeru(Stream *stream, uint64 m, uint64 n, 539 std::complex<float> alpha, 540 const DeviceMemory<std::complex<float>> &x, int incx, 541 const DeviceMemory<std::complex<float>> &y, int incy, 542 DeviceMemory<std::complex<float>> *a, int lda) = 0; 543 virtual bool DoBlasGeru(Stream *stream, uint64 m, uint64 n, 544 std::complex<double> alpha, 545 const DeviceMemory<std::complex<double>> &x, int incx, 546 const DeviceMemory<std::complex<double>> &y, int incy, 547 DeviceMemory<std::complex<double>> *a, int lda) = 0; 548 549 // Computes a matrix-vector product using a Hermitian band matrix. 550 // 551 // y <- alpha * a * x + beta * y, 552 // 553 // alpha and beta are scalars; a is an n-by-n Hermitian band matrix, with k 554 // super-diagonals; x and y are n-element vectors. 555 virtual bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, 556 uint64 k, std::complex<float> alpha, 557 const DeviceMemory<std::complex<float>> &a, int lda, 558 const DeviceMemory<std::complex<float>> &x, int incx, 559 std::complex<float> beta, 560 DeviceMemory<std::complex<float>> *y, int incy) = 0; 561 virtual bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, 562 uint64 k, std::complex<double> alpha, 563 const DeviceMemory<std::complex<double>> &a, int lda, 564 const DeviceMemory<std::complex<double>> &x, int incx, 565 std::complex<double> beta, 566 DeviceMemory<std::complex<double>> *y, int incy) = 0; 567 568 // Computes a matrix-vector product using a Hermitian matrix. 569 // 570 // y <- alpha * a * x + beta * y, 571 // 572 // alpha and beta are scalars; a is an n-by-n Hermitian matrix; x and y are 573 // n-element vectors. 574 virtual bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, 575 std::complex<float> alpha, 576 const DeviceMemory<std::complex<float>> &a, int lda, 577 const DeviceMemory<std::complex<float>> &x, int incx, 578 std::complex<float> beta, 579 DeviceMemory<std::complex<float>> *y, int incy) = 0; 580 virtual bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, 581 std::complex<double> alpha, 582 const DeviceMemory<std::complex<double>> &a, int lda, 583 const DeviceMemory<std::complex<double>> &x, int incx, 584 std::complex<double> beta, 585 DeviceMemory<std::complex<double>> *y, int incy) = 0; 586 587 // Performs a rank-1 update of a Hermitian matrix. 588 // 589 // a <- alpha * x * conj(x') + a, 590 // 591 // alpha is a scalar; x is an n-element vector; a is an n-by-n Hermitian 592 // matrix. 593 virtual bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, 594 float alpha, 595 const DeviceMemory<std::complex<float>> &x, int incx, 596 DeviceMemory<std::complex<float>> *a, int lda) = 0; 597 virtual bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, 598 double alpha, 599 const DeviceMemory<std::complex<double>> &x, int incx, 600 DeviceMemory<std::complex<double>> *a, int lda) = 0; 601 602 // Performs a rank-2 update of a Hermitian matrix. 603 // 604 // a <- alpha * x * conj(x') + conj(alpha) * y * conj(x') + a, 605 // 606 // alpha is a scalar; x and y are n-element vectors; a is an n-by-n Hermitian 607 // matrix. 608 virtual bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, 609 std::complex<float> alpha, 610 const DeviceMemory<std::complex<float>> &x, int incx, 611 const DeviceMemory<std::complex<float>> &y, int incy, 612 DeviceMemory<std::complex<float>> *a, int lda) = 0; 613 virtual bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, 614 std::complex<double> alpha, 615 const DeviceMemory<std::complex<double>> &x, int incx, 616 const DeviceMemory<std::complex<double>> &y, int incy, 617 DeviceMemory<std::complex<double>> *a, int lda) = 0; 618 619 // Computes a matrix-vector product using a Hermitian packed matrix. 620 // 621 // y <- alpha * a * x + beta * y, 622 // 623 // alpha and beta are scalars; a is an n-by-n Hermitian matrix, supplied in 624 // packed form; x and y are n-element vectors. 625 virtual bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, 626 std::complex<float> alpha, 627 const DeviceMemory<std::complex<float>> &ap, 628 const DeviceMemory<std::complex<float>> &x, int incx, 629 std::complex<float> beta, 630 DeviceMemory<std::complex<float>> *y, int incy) = 0; 631 virtual bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, 632 std::complex<double> alpha, 633 const DeviceMemory<std::complex<double>> &ap, 634 const DeviceMemory<std::complex<double>> &x, int incx, 635 std::complex<double> beta, 636 DeviceMemory<std::complex<double>> *y, int incy) = 0; 637 638 // Performs a rank-1 update of a Hermitian packed matrix. 639 // 640 // a <- alpha * x * conj(x') + a, 641 // 642 // alpha is a scalar; x is an n-element vector; a is an n-by-n Hermitian 643 // matrix, supplied in packed form. 644 virtual bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, 645 float alpha, 646 const DeviceMemory<std::complex<float>> &x, int incx, 647 DeviceMemory<std::complex<float>> *ap) = 0; 648 virtual bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, 649 double alpha, 650 const DeviceMemory<std::complex<double>> &x, int incx, 651 DeviceMemory<std::complex<double>> *ap) = 0; 652 653 // Performs a rank-2 update of a Hermitian packed matrix. 654 // 655 // a <- alpha * x * conj(x') + conj(alpha) * y * conj(x') + a, 656 // 657 // alpha is a scalar; x and y are n-element vectors; a is an n-by-n Hermitian 658 // matrix, supplied in packed form. 659 virtual bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, 660 std::complex<float> alpha, 661 const DeviceMemory<std::complex<float>> &x, int incx, 662 const DeviceMemory<std::complex<float>> &y, int incy, 663 DeviceMemory<std::complex<float>> *ap) = 0; 664 virtual bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, 665 std::complex<double> alpha, 666 const DeviceMemory<std::complex<double>> &x, int incx, 667 const DeviceMemory<std::complex<double>> &y, int incy, 668 DeviceMemory<std::complex<double>> *ap) = 0; 669 670 // Computes a matrix-vector product using a symmetric band matrix. 671 // 672 // y <- alpha * a * x + beta * y, 673 // 674 // alpha and beta are scalars; a is an n-by-n symmetric band matrix, with k 675 // super-diagonals; x and y are n-element vectors. 676 virtual bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, 677 uint64 k, float alpha, const DeviceMemory<float> &a, 678 int lda, const DeviceMemory<float> &x, int incx, 679 float beta, DeviceMemory<float> *y, int incy) = 0; 680 virtual bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, 681 uint64 k, double alpha, const DeviceMemory<double> &a, 682 int lda, const DeviceMemory<double> &x, int incx, 683 double beta, DeviceMemory<double> *y, int incy) = 0; 684 685 // Computes a matrix-vector product using a symmetric packed matrix. 686 // 687 // y <- alpha * a * x + beta * y, 688 // 689 // alpha and beta are scalars; a is an n-by-n symmetric matrix, supplied in 690 // packed form; x and y are n-element vectors. 691 virtual bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, 692 float alpha, const DeviceMemory<float> &ap, 693 const DeviceMemory<float> &x, int incx, float beta, 694 DeviceMemory<float> *y, int incy) = 0; 695 virtual bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, 696 double alpha, const DeviceMemory<double> &ap, 697 const DeviceMemory<double> &x, int incx, double beta, 698 DeviceMemory<double> *y, int incy) = 0; 699 700 // Performs a rank-1 update of a symmetric packed matrix. 701 // 702 // a <- alpha * x * x' + a, 703 // 704 // alpha is a scalar; x is an n-element vector; a is an n-by-n symmetric 705 // matrix, supplied in packed form. 706 virtual bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, 707 float alpha, const DeviceMemory<float> &x, int incx, 708 DeviceMemory<float> *ap) = 0; 709 virtual bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, 710 double alpha, const DeviceMemory<double> &x, int incx, 711 DeviceMemory<double> *ap) = 0; 712 713 // Performs a rank-2 update of a symmetric packed matrix. 714 // 715 // a <- alpha * x * x' + alpha * y * x' + a, 716 // 717 // alpha is a scalar; x and y are n-element vectors; a is an n-by-n symmetric 718 // matrix, supplied in packed form. 719 virtual bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, 720 float alpha, const DeviceMemory<float> &x, int incx, 721 const DeviceMemory<float> &y, int incy, 722 DeviceMemory<float> *ap) = 0; 723 virtual bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, 724 double alpha, const DeviceMemory<double> &x, int incx, 725 const DeviceMemory<double> &y, int incy, 726 DeviceMemory<double> *ap) = 0; 727 728 // Computes a matrix-vector product for a symmetric matrix. 729 // 730 // y <- alpha * a * x + beta * y, 731 // 732 // alpha and beta are scalars; a is an n-by-n symmetric matrix; x and y are 733 // n-element vectors. 734 virtual bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, 735 float alpha, const DeviceMemory<float> &a, int lda, 736 const DeviceMemory<float> &x, int incx, float beta, 737 DeviceMemory<float> *y, int incy) = 0; 738 virtual bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, 739 double alpha, const DeviceMemory<double> &a, int lda, 740 const DeviceMemory<double> &x, int incx, double beta, 741 DeviceMemory<double> *y, int incy) = 0; 742 743 // Performs a rank-1 update of a symmetric matrix. 744 // 745 // a <- alpha * x * x' + a, 746 // 747 // alpha is a scalar; x is an n-element vector; a is an n-by-n symmetric 748 // matrix. 749 virtual bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, 750 float alpha, const DeviceMemory<float> &x, int incx, 751 DeviceMemory<float> *a, int lda) = 0; 752 virtual bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, 753 double alpha, const DeviceMemory<double> &x, int incx, 754 DeviceMemory<double> *a, int lda) = 0; 755 756 // Performs a rank-2 update of symmetric matrix. 757 // 758 // a <- alpha * x * x' + alpha * y * x' + a, 759 // 760 // alpha is a scalar; x and y are n-element vectors; a is an n-by-n symmetric 761 // matrix. 762 virtual bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, 763 float alpha, const DeviceMemory<float> &x, int incx, 764 const DeviceMemory<float> &y, int incy, 765 DeviceMemory<float> *a, int lda) = 0; 766 virtual bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, 767 double alpha, const DeviceMemory<double> &x, int incx, 768 const DeviceMemory<double> &y, int incy, 769 DeviceMemory<double> *a, int lda) = 0; 770 771 // Computes a matrix-vector product using a triangular band matrix. 772 // 773 // x <- a * x, 774 // or 775 // x <- a' * x, 776 // or 777 // x <- conj(a') * x, 778 // 779 // a is an n-by-n unit, or non-unit, upper or lower triangular band matrix, 780 // with k+1 diagonals; x is a n-element vector. 781 virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, 782 blas::Transpose trans, blas::Diagonal diag, uint64 n, 783 uint64 k, const DeviceMemory<float> &a, int lda, 784 DeviceMemory<float> *x, int incx) = 0; 785 virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, 786 blas::Transpose trans, blas::Diagonal diag, uint64 n, 787 uint64 k, const DeviceMemory<double> &a, int lda, 788 DeviceMemory<double> *x, int incx) = 0; 789 virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, 790 blas::Transpose trans, blas::Diagonal diag, uint64 n, 791 uint64 k, const DeviceMemory<std::complex<float>> &a, 792 int lda, DeviceMemory<std::complex<float>> *x, 793 int incx) = 0; 794 virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, 795 blas::Transpose trans, blas::Diagonal diag, uint64 n, 796 uint64 k, const DeviceMemory<std::complex<double>> &a, 797 int lda, DeviceMemory<std::complex<double>> *x, 798 int incx) = 0; 799 800 // Solves a system of linear equations whose coefficients are in a triangular 801 // band matrix as below: 802 // 803 // a * x = b, 804 // or 805 // a' * x = b, 806 // or 807 // conj(a') * x = b, 808 // 809 // b and x are n-element vectors; a is an n-by-n unit, or non-unit, upper or 810 // lower triangular band matrix, with k+1 diagonals. 811 virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, 812 blas::Transpose trans, blas::Diagonal diag, uint64 n, 813 uint64 k, const DeviceMemory<float> &a, int lda, 814 DeviceMemory<float> *x, int incx) = 0; 815 virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, 816 blas::Transpose trans, blas::Diagonal diag, uint64 n, 817 uint64 k, const DeviceMemory<double> &a, int lda, 818 DeviceMemory<double> *x, int incx) = 0; 819 virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, 820 blas::Transpose trans, blas::Diagonal diag, uint64 n, 821 uint64 k, const DeviceMemory<std::complex<float>> &a, 822 int lda, DeviceMemory<std::complex<float>> *x, 823 int incx) = 0; 824 virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, 825 blas::Transpose trans, blas::Diagonal diag, uint64 n, 826 uint64 k, const DeviceMemory<std::complex<double>> &a, 827 int lda, DeviceMemory<std::complex<double>> *x, 828 int incx) = 0; 829 830 // Computes a matrix-vector product using a triangular packed matrix. 831 // 832 // x <- a * x, 833 // or 834 // x <- a' * x, 835 // or 836 // x <- conj(a') * x, 837 // 838 // a is an n-by-n unit, or non-unit, upper or lower triangular matrix, 839 // supplied in packed form; x is a n-element vector. 840 virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, 841 blas::Transpose trans, blas::Diagonal diag, uint64 n, 842 const DeviceMemory<float> &ap, DeviceMemory<float> *x, 843 int incx) = 0; 844 virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, 845 blas::Transpose trans, blas::Diagonal diag, uint64 n, 846 const DeviceMemory<double> &ap, 847 DeviceMemory<double> *x, int incx) = 0; 848 virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, 849 blas::Transpose trans, blas::Diagonal diag, uint64 n, 850 const DeviceMemory<std::complex<float>> &ap, 851 DeviceMemory<std::complex<float>> *x, int incx) = 0; 852 virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, 853 blas::Transpose trans, blas::Diagonal diag, uint64 n, 854 const DeviceMemory<std::complex<double>> &ap, 855 DeviceMemory<std::complex<double>> *x, int incx) = 0; 856 857 // Solves a system of linear equations whose coefficients are in a triangular 858 // packed matrix as below: 859 // 860 // a * x = b, 861 // or 862 // a' * x = b, 863 // or 864 // conj(a') * x = b, 865 // 866 // b and x are n-element vectors; a is an n-by-n unit, or non-unit, upper or 867 // lower triangular matrix, supplied in packed form. 868 virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, 869 blas::Transpose trans, blas::Diagonal diag, uint64 n, 870 const DeviceMemory<float> &ap, DeviceMemory<float> *x, 871 int incx) = 0; 872 virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, 873 blas::Transpose trans, blas::Diagonal diag, uint64 n, 874 const DeviceMemory<double> &ap, 875 DeviceMemory<double> *x, int incx) = 0; 876 virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, 877 blas::Transpose trans, blas::Diagonal diag, uint64 n, 878 const DeviceMemory<std::complex<float>> &ap, 879 DeviceMemory<std::complex<float>> *x, int incx) = 0; 880 virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, 881 blas::Transpose trans, blas::Diagonal diag, uint64 n, 882 const DeviceMemory<std::complex<double>> &ap, 883 DeviceMemory<std::complex<double>> *x, int incx) = 0; 884 885 // Computes a matrix-vector product using a triangular matrix. 886 // 887 // x <- a * x, 888 // or 889 // x <- a' * x, 890 // or 891 // x <- conj(a') * x, 892 // 893 // a is an n-by-n unit, or non-unit, upper or lower triangular matrix; x is a 894 // n-element vector. 895 virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, 896 blas::Transpose trans, blas::Diagonal diag, uint64 n, 897 const DeviceMemory<float> &a, int lda, 898 DeviceMemory<float> *x, int incx) = 0; 899 virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, 900 blas::Transpose trans, blas::Diagonal diag, uint64 n, 901 const DeviceMemory<double> &a, int lda, 902 DeviceMemory<double> *x, int incx) = 0; 903 virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, 904 blas::Transpose trans, blas::Diagonal diag, uint64 n, 905 const DeviceMemory<std::complex<float>> &a, int lda, 906 DeviceMemory<std::complex<float>> *x, int incx) = 0; 907 virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, 908 blas::Transpose trans, blas::Diagonal diag, uint64 n, 909 const DeviceMemory<std::complex<double>> &a, int lda, 910 DeviceMemory<std::complex<double>> *x, int incx) = 0; 911 912 // Solves a system of linear equations whose coefficients are in a triangular 913 // matrix as below: 914 // 915 // a * x = b, 916 // or 917 // a' * x = b, 918 // or 919 // conj(a') * x = b, 920 // 921 // b and x are n-element vectors; a is an n-by-n unit, or non-unit, upper or 922 // lower triangular matrix. 923 virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, 924 blas::Transpose trans, blas::Diagonal diag, uint64 n, 925 const DeviceMemory<float> &a, int lda, 926 DeviceMemory<float> *x, int incx) = 0; 927 virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, 928 blas::Transpose trans, blas::Diagonal diag, uint64 n, 929 const DeviceMemory<double> &a, int lda, 930 DeviceMemory<double> *x, int incx) = 0; 931 virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, 932 blas::Transpose trans, blas::Diagonal diag, uint64 n, 933 const DeviceMemory<std::complex<float>> &a, int lda, 934 DeviceMemory<std::complex<float>> *x, int incx) = 0; 935 virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, 936 blas::Transpose trans, blas::Diagonal diag, uint64 n, 937 const DeviceMemory<std::complex<double>> &a, int lda, 938 DeviceMemory<std::complex<double>> *x, int incx) = 0; 939 940 // Computes a matrix-matrix product with general matrices: 941 // 942 // c <- alpha * op(a) * op(b) + beta * c, 943 // 944 // op(X) is one of op(X) = X, or op(X) = X', or op(X) = conj(X'); alpha and 945 // beta are scalars; a, b, and c are matrices; op(a) is an m-by-k matrix; 946 // op(b) is a k-by-n matrix; c is an m-by-n matrix. 947 // 948 // Note: The half interface uses float precision internally; the version 949 // that uses half precision internally is not yet supported. There is no 950 // batched version of the half-precision interface. 951 virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa, 952 blas::Transpose transb, uint64 m, uint64 n, uint64 k, 953 float alpha, const DeviceMemory<Eigen::half> &a, 954 int lda, const DeviceMemory<Eigen::half> &b, int ldb, 955 float beta, DeviceMemory<Eigen::half> *c, 956 int ldc) = 0; 957 virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa, 958 blas::Transpose transb, uint64 m, uint64 n, uint64 k, 959 float alpha, const DeviceMemory<float> &a, int lda, 960 const DeviceMemory<float> &b, int ldb, float beta, 961 DeviceMemory<float> *c, int ldc) = 0; 962 virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa, 963 blas::Transpose transb, uint64 m, uint64 n, uint64 k, 964 double alpha, const DeviceMemory<double> &a, int lda, 965 const DeviceMemory<double> &b, int ldb, double beta, 966 DeviceMemory<double> *c, int ldc) = 0; 967 virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa, 968 blas::Transpose transb, uint64 m, uint64 n, uint64 k, 969 std::complex<float> alpha, 970 const DeviceMemory<std::complex<float>> &a, int lda, 971 const DeviceMemory<std::complex<float>> &b, int ldb, 972 std::complex<float> beta, 973 DeviceMemory<std::complex<float>> *c, int ldc) = 0; 974 virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa, 975 blas::Transpose transb, uint64 m, uint64 n, uint64 k, 976 std::complex<double> alpha, 977 const DeviceMemory<std::complex<double>> &a, int lda, 978 const DeviceMemory<std::complex<double>> &b, int ldb, 979 std::complex<double> beta, 980 DeviceMemory<std::complex<double>> *c, int ldc) = 0; 981 982 virtual bool DoBlasGemmWithProfiling( 983 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 984 uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a, 985 int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta, 986 DeviceMemory<Eigen::half> *c, int ldc, 987 ProfileResult *output_profile_result) = 0; 988 virtual bool DoBlasGemmWithProfiling( 989 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 990 uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda, 991 const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c, 992 int ldc, ProfileResult *output_profile_result) = 0; 993 virtual bool DoBlasGemmWithProfiling( 994 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 995 uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda, 996 const DeviceMemory<double> &b, int ldb, double beta, 997 DeviceMemory<double> *c, int ldc, 998 ProfileResult *output_profile_result) = 0; 999 virtual bool DoBlasGemmWithProfiling( 1000 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1001 uint64 n, uint64 k, std::complex<float> alpha, 1002 const DeviceMemory<std::complex<float>> &a, int lda, 1003 const DeviceMemory<std::complex<float>> &b, int ldb, 1004 std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, 1005 ProfileResult *output_profile_result) = 0; 1006 virtual bool DoBlasGemmWithProfiling( 1007 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1008 uint64 n, uint64 k, std::complex<double> alpha, 1009 const DeviceMemory<std::complex<double>> &a, int lda, 1010 const DeviceMemory<std::complex<double>> &b, int ldb, 1011 std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc, 1012 ProfileResult *output_profile_result) = 0; 1013 1014 // Gets a list of supported algorithms for DoBlasGemmWithAlgorithm. 1015 virtual bool GetBlasGemmAlgorithms( 1016 std::vector<AlgorithmType> *out_algorithms) = 0; 1017 1018 // Like DoBlasGemm, but accepts an algorithm and an compute type. 1019 // 1020 // The compute type lets you say (e.g.) that the inputs and outputs are 1021 // Eigen::halfs, but you want the internal computations to be done with 1022 // float32 precision. 1023 // 1024 // Note the subtle difference in the version that accepts Eigen:::half -- 1025 // alpha and beta have type const Eigen::half&, not float. 1026 // 1027 // If output_profile_result is not null, a failure here does not put the 1028 // stream in a failure state. Instead, success/failure is indicated by 1029 // output_profile_result->is_valid(). This lets you use this function for 1030 // choosing the best algorithm among many (some of which may fail) without 1031 // creating a new Stream for each attempt. 1032 virtual bool DoBlasGemmWithAlgorithm( 1033 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1034 uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a, int lda, 1035 const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int32> *c, 1036 int ldc, ComputationType computation_type, AlgorithmType algorithm, 1037 ProfileResult *output_profile_result) = 0; 1038 virtual bool DoBlasGemmWithAlgorithm( 1039 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1040 uint64 n, uint64 k, const Eigen::half &alpha, 1041 const DeviceMemory<Eigen::half> &a, int lda, 1042 const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta, 1043 DeviceMemory<Eigen::half> *c, int ldc, ComputationType computation_type, 1044 AlgorithmType algorithm, ProfileResult *output_profile_result) = 0; 1045 virtual bool DoBlasGemmWithAlgorithm( 1046 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1047 uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda, 1048 const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c, 1049 int ldc, ComputationType computation_type, AlgorithmType algorithm, 1050 ProfileResult *output_profile_result) = 0; 1051 virtual bool DoBlasGemmWithAlgorithm( 1052 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1053 uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda, 1054 const DeviceMemory<double> &b, int ldb, double beta, 1055 DeviceMemory<double> *c, int ldc, ComputationType computation_type, 1056 AlgorithmType algorithm, ProfileResult *output_profile_result) = 0; 1057 virtual bool DoBlasGemmWithAlgorithm( 1058 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1059 uint64 n, uint64 k, std::complex<float> alpha, 1060 const DeviceMemory<std::complex<float>> &a, int lda, 1061 const DeviceMemory<std::complex<float>> &b, int ldb, 1062 std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, 1063 ComputationType computation_type, AlgorithmType algorithm, 1064 ProfileResult *output_profile_result) = 0; 1065 virtual bool DoBlasGemmWithAlgorithm( 1066 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1067 uint64 n, uint64 k, std::complex<double> alpha, 1068 const DeviceMemory<std::complex<double>> &a, int lda, 1069 const DeviceMemory<std::complex<double>> &b, int ldb, 1070 std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc, 1071 ComputationType computation_type, AlgorithmType algorithm, 1072 ProfileResult *output_profile_result) = 0; 1073 1074 // Computes a batch of matrix-matrix product with general matrices. 1075 // This is a batched version of DoBlasGemm. 1076 // The batched GEMM computes matrix product for each input/output in a, b, 1077 // and c, which contain batch_count DeviceMemory objects. 1078 virtual bool DoBlasGemmBatched( 1079 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1080 uint64 n, uint64 k, float alpha, 1081 const port::ArraySlice<DeviceMemory<float> *> &a, int lda, 1082 const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, 1083 const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, 1084 int batch_count, ScratchAllocator *scratch_allocator) = 0; 1085 virtual bool DoBlasGemmBatched( 1086 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1087 uint64 n, uint64 k, double alpha, 1088 const port::ArraySlice<DeviceMemory<double> *> &a, int lda, 1089 const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, 1090 const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, 1091 int batch_count, ScratchAllocator *scratch_allocator) = 0; 1092 virtual bool DoBlasGemmBatched( 1093 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1094 uint64 n, uint64 k, std::complex<float> alpha, 1095 const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda, 1096 const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, 1097 std::complex<float> beta, 1098 const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, 1099 int batch_count, ScratchAllocator *scratch_allocator) = 0; 1100 virtual bool DoBlasGemmBatched( 1101 Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, 1102 uint64 n, uint64 k, std::complex<double> alpha, 1103 const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda, 1104 const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb, 1105 std::complex<double> beta, 1106 const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc, 1107 int batch_count, ScratchAllocator *scratch_allocator) = 0; 1108 1109 // Computes a matrix-matrix product where one input matrix is Hermitian: 1110 // 1111 // c <- alpha * a * b + beta * c, 1112 // or 1113 // c <- alpha * b * a + beta * c, 1114 // 1115 // alpha and beta are scalars; a is a Hermitian matrix; b and c are m-by-n 1116 // matrices. 1117 virtual bool DoBlasHemm(Stream *stream, blas::Side side, 1118 blas::UpperLower uplo, uint64 m, uint64 n, 1119 std::complex<float> alpha, 1120 const DeviceMemory<std::complex<float>> &a, int lda, 1121 const DeviceMemory<std::complex<float>> &b, int ldb, 1122 std::complex<float> beta, 1123 DeviceMemory<std::complex<float>> *c, int ldc) = 0; 1124 virtual bool DoBlasHemm(Stream *stream, blas::Side side, 1125 blas::UpperLower uplo, uint64 m, uint64 n, 1126 std::complex<double> alpha, 1127 const DeviceMemory<std::complex<double>> &a, int lda, 1128 const DeviceMemory<std::complex<double>> &b, int ldb, 1129 std::complex<double> beta, 1130 DeviceMemory<std::complex<double>> *c, int ldc) = 0; 1131 1132 // Performs a Hermitian rank-k update. 1133 // 1134 // c <- alpha * a * conj(a') + beta * c, 1135 // or 1136 // c <- alpha * conj(a') * a + beta * c, 1137 // 1138 // alpha and beta are scalars; c is a n-by-n Hermitian matrix; a is an n-by-k 1139 // matrix in the first case and a k-by-n matrix in the second case. 1140 virtual bool DoBlasHerk(Stream *stream, blas::UpperLower uplo, 1141 blas::Transpose trans, uint64 n, uint64 k, 1142 float alpha, 1143 const DeviceMemory<std::complex<float>> &a, int lda, 1144 float beta, DeviceMemory<std::complex<float>> *c, 1145 int ldc) = 0; 1146 virtual bool DoBlasHerk(Stream *stream, blas::UpperLower uplo, 1147 blas::Transpose trans, uint64 n, uint64 k, 1148 double alpha, 1149 const DeviceMemory<std::complex<double>> &a, int lda, 1150 double beta, DeviceMemory<std::complex<double>> *c, 1151 int ldc) = 0; 1152 1153 // Performs a Hermitian rank-2k update. 1154 // 1155 // c <- alpha * a * conj(b') + conj(alpha) * b * conj(a') + beta * c, 1156 // or 1157 // c <- alpha * conj(b') * a + conj(alpha) * conj(a') * b + beta * c, 1158 // 1159 // alpha and beta are scalars; c is a n-by-n Hermitian matrix; a and b are 1160 // n-by-k matrices in the first case and k-by-n matrices in the second case. 1161 virtual bool DoBlasHer2k(Stream *stream, blas::UpperLower uplo, 1162 blas::Transpose trans, uint64 n, uint64 k, 1163 std::complex<float> alpha, 1164 const DeviceMemory<std::complex<float>> &a, int lda, 1165 const DeviceMemory<std::complex<float>> &b, int ldb, 1166 float beta, DeviceMemory<std::complex<float>> *c, 1167 int ldc) = 0; 1168 virtual bool DoBlasHer2k(Stream *stream, blas::UpperLower uplo, 1169 blas::Transpose trans, uint64 n, uint64 k, 1170 std::complex<double> alpha, 1171 const DeviceMemory<std::complex<double>> &a, int lda, 1172 const DeviceMemory<std::complex<double>> &b, int ldb, 1173 double beta, DeviceMemory<std::complex<double>> *c, 1174 int ldc) = 0; 1175 1176 // Computes a matrix-matrix product where one input matrix is symmetric. 1177 // 1178 // c <- alpha * a * b + beta * c, 1179 // or 1180 // c <- alpha * b * a + beta * c, 1181 // 1182 // alpha and beta are scalars; a is a symmetric matrix; b and c are m-by-n 1183 // matrices. 1184 virtual bool DoBlasSymm(Stream *stream, blas::Side side, 1185 blas::UpperLower uplo, uint64 m, uint64 n, 1186 float alpha, const DeviceMemory<float> &a, int lda, 1187 const DeviceMemory<float> &b, int ldb, float beta, 1188 DeviceMemory<float> *c, int ldc) = 0; 1189 virtual bool DoBlasSymm(Stream *stream, blas::Side side, 1190 blas::UpperLower uplo, uint64 m, uint64 n, 1191 double alpha, const DeviceMemory<double> &a, int lda, 1192 const DeviceMemory<double> &b, int ldb, double beta, 1193 DeviceMemory<double> *c, int ldc) = 0; 1194 virtual bool DoBlasSymm(Stream *stream, blas::Side side, 1195 blas::UpperLower uplo, uint64 m, uint64 n, 1196 std::complex<float> alpha, 1197 const DeviceMemory<std::complex<float>> &a, int lda, 1198 const DeviceMemory<std::complex<float>> &b, int ldb, 1199 std::complex<float> beta, 1200 DeviceMemory<std::complex<float>> *c, int ldc) = 0; 1201 virtual bool DoBlasSymm(Stream *stream, blas::Side side, 1202 blas::UpperLower uplo, uint64 m, uint64 n, 1203 std::complex<double> alpha, 1204 const DeviceMemory<std::complex<double>> &a, int lda, 1205 const DeviceMemory<std::complex<double>> &b, int ldb, 1206 std::complex<double> beta, 1207 DeviceMemory<std::complex<double>> *c, int ldc) = 0; 1208 1209 // Performs a symmetric rank-k update. 1210 // 1211 // c <- alpha * a * a' + beta * c, 1212 // or 1213 // c <- alpha * a' * a + beta * c, 1214 // 1215 // alpha and beta are scalars; c is a n-by-n symmetric matrix; a is an n-by-k 1216 // matrix in the first case and a k-by-n matrix in the second case. 1217 virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, 1218 blas::Transpose trans, uint64 n, uint64 k, 1219 float alpha, const DeviceMemory<float> &a, int lda, 1220 float beta, DeviceMemory<float> *c, int ldc) = 0; 1221 virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, 1222 blas::Transpose trans, uint64 n, uint64 k, 1223 double alpha, const DeviceMemory<double> &a, int lda, 1224 double beta, DeviceMemory<double> *c, int ldc) = 0; 1225 virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, 1226 blas::Transpose trans, uint64 n, uint64 k, 1227 std::complex<float> alpha, 1228 const DeviceMemory<std::complex<float>> &a, int lda, 1229 std::complex<float> beta, 1230 DeviceMemory<std::complex<float>> *c, int ldc) = 0; 1231 virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, 1232 blas::Transpose trans, uint64 n, uint64 k, 1233 std::complex<double> alpha, 1234 const DeviceMemory<std::complex<double>> &a, int lda, 1235 std::complex<double> beta, 1236 DeviceMemory<std::complex<double>> *c, int ldc) = 0; 1237 1238 // Performs a symmetric rank-2k update. 1239 // 1240 // c <- alpha * a * b' + alpha * b * a' + beta * c, 1241 // or 1242 // c <- alpha * b' * a + alpha * a' * b + beta * c, 1243 // 1244 // alpha and beta are scalars; c is a n-by-n symmetric matrix; a and b are 1245 // n-by-k matrices in the first case and k-by-n matrices in the second case. 1246 virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, 1247 blas::Transpose trans, uint64 n, uint64 k, 1248 float alpha, const DeviceMemory<float> &a, int lda, 1249 const DeviceMemory<float> &b, int ldb, float beta, 1250 DeviceMemory<float> *c, int ldc) = 0; 1251 virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, 1252 blas::Transpose trans, uint64 n, uint64 k, 1253 double alpha, const DeviceMemory<double> &a, int lda, 1254 const DeviceMemory<double> &b, int ldb, double beta, 1255 DeviceMemory<double> *c, int ldc) = 0; 1256 virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, 1257 blas::Transpose trans, uint64 n, uint64 k, 1258 std::complex<float> alpha, 1259 const DeviceMemory<std::complex<float>> &a, int lda, 1260 const DeviceMemory<std::complex<float>> &b, int ldb, 1261 std::complex<float> beta, 1262 DeviceMemory<std::complex<float>> *c, int ldc) = 0; 1263 virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, 1264 blas::Transpose trans, uint64 n, uint64 k, 1265 std::complex<double> alpha, 1266 const DeviceMemory<std::complex<double>> &a, int lda, 1267 const DeviceMemory<std::complex<double>> &b, int ldb, 1268 std::complex<double> beta, 1269 DeviceMemory<std::complex<double>> *c, int ldc) = 0; 1270 1271 // Computes a matrix-matrix product where one input matrix is triangular. 1272 // 1273 // b <- alpha * op(a) * b, 1274 // or 1275 // b <- alpha * b * op(a) 1276 // 1277 // alpha is a scalar; b is an m-by-n matrix; a is a unit, or non-unit, upper 1278 // or lower triangular matrix; op(a) is one of op(a) = a, or op(a) = a', or 1279 // op(a) = conj(a'). 1280 virtual bool DoBlasTrmm(Stream *stream, blas::Side side, 1281 blas::UpperLower uplo, blas::Transpose transa, 1282 blas::Diagonal diag, uint64 m, uint64 n, float alpha, 1283 const DeviceMemory<float> &a, int lda, 1284 DeviceMemory<float> *b, int ldb) = 0; 1285 virtual bool DoBlasTrmm(Stream *stream, blas::Side side, 1286 blas::UpperLower uplo, blas::Transpose transa, 1287 blas::Diagonal diag, uint64 m, uint64 n, double alpha, 1288 const DeviceMemory<double> &a, int lda, 1289 DeviceMemory<double> *b, int ldb) = 0; 1290 virtual bool DoBlasTrmm(Stream *stream, blas::Side side, 1291 blas::UpperLower uplo, blas::Transpose transa, 1292 blas::Diagonal diag, uint64 m, uint64 n, 1293 std::complex<float> alpha, 1294 const DeviceMemory<std::complex<float>> &a, int lda, 1295 DeviceMemory<std::complex<float>> *b, int ldb) = 0; 1296 virtual bool DoBlasTrmm(Stream *stream, blas::Side side, 1297 blas::UpperLower uplo, blas::Transpose transa, 1298 blas::Diagonal diag, uint64 m, uint64 n, 1299 std::complex<double> alpha, 1300 const DeviceMemory<std::complex<double>> &a, int lda, 1301 DeviceMemory<std::complex<double>> *b, int ldb) = 0; 1302 1303 // Solves a triangular matrix equation. 1304 // 1305 // op(a) * x = alpha * b, 1306 // or 1307 // x * op(a) = alpha * b 1308 // 1309 // alpha is a scalar; x and b are m-by-n matrices; a is a unit, or non-unit, 1310 // upper or lower triangular matrix; op(a) is one of op(a) = a, or op(a) = a', 1311 // or op(a) = conj(a'). 1312 virtual bool DoBlasTrsm(Stream *stream, blas::Side side, 1313 blas::UpperLower uplo, blas::Transpose transa, 1314 blas::Diagonal diag, uint64 m, uint64 n, float alpha, 1315 const DeviceMemory<float> &a, int lda, 1316 DeviceMemory<float> *b, int ldb) = 0; 1317 virtual bool DoBlasTrsm(Stream *stream, blas::Side side, 1318 blas::UpperLower uplo, blas::Transpose transa, 1319 blas::Diagonal diag, uint64 m, uint64 n, double alpha, 1320 const DeviceMemory<double> &a, int lda, 1321 DeviceMemory<double> *b, int ldb) = 0; 1322 virtual bool DoBlasTrsm(Stream *stream, blas::Side side, 1323 blas::UpperLower uplo, blas::Transpose transa, 1324 blas::Diagonal diag, uint64 m, uint64 n, 1325 std::complex<float> alpha, 1326 const DeviceMemory<std::complex<float>> &a, int lda, 1327 DeviceMemory<std::complex<float>> *b, int ldb) = 0; 1328 virtual bool DoBlasTrsm(Stream *stream, blas::Side side, 1329 blas::UpperLower uplo, blas::Transpose transa, 1330 blas::Diagonal diag, uint64 m, uint64 n, 1331 std::complex<double> alpha, 1332 const DeviceMemory<std::complex<double>> &a, int lda, 1333 DeviceMemory<std::complex<double>> *b, int ldb) = 0; 1334 1335 protected: 1336 BlasSupport() {} 1337 1338 private: 1339 SE_DISALLOW_COPY_AND_ASSIGN(BlasSupport); 1340 }; 1341 1342 // Macro used to quickly declare overrides for abstract virtuals in the 1343 // BlasSupport base class. 1344 #define TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES \ 1345 bool DoBlasAsum(Stream *stream, uint64 elem_count, \ 1346 const DeviceMemory<float> &x, int incx, \ 1347 DeviceMemory<float> *result) override; \ 1348 bool DoBlasAsum(Stream *stream, uint64 elem_count, \ 1349 const DeviceMemory<double> &x, int incx, \ 1350 DeviceMemory<double> *result) override; \ 1351 bool DoBlasAsum(Stream *stream, uint64 elem_count, \ 1352 const DeviceMemory<std::complex<float>> &x, int incx, \ 1353 DeviceMemory<float> *result) override; \ 1354 bool DoBlasAsum(Stream *stream, uint64 elem_count, \ 1355 const DeviceMemory<std::complex<double>> &x, int incx, \ 1356 DeviceMemory<double> *result) override; \ 1357 bool DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha, \ 1358 const DeviceMemory<float> &x, int incx, \ 1359 DeviceMemory<float> *y, int incy) override; \ 1360 bool DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha, \ 1361 const DeviceMemory<double> &x, int incx, \ 1362 DeviceMemory<double> *y, int incy) override; \ 1363 bool DoBlasAxpy(Stream *stream, uint64 elem_count, \ 1364 std::complex<float> alpha, \ 1365 const DeviceMemory<std::complex<float>> &x, int incx, \ 1366 DeviceMemory<std::complex<float>> *y, int incy) override; \ 1367 bool DoBlasAxpy(Stream *stream, uint64 elem_count, \ 1368 std::complex<double> alpha, \ 1369 const DeviceMemory<std::complex<double>> &x, int incx, \ 1370 DeviceMemory<std::complex<double>> *y, int incy) override; \ 1371 bool DoBlasCopy(Stream *stream, uint64 elem_count, \ 1372 const DeviceMemory<float> &x, int incx, \ 1373 DeviceMemory<float> *y, int incy) override; \ 1374 bool DoBlasCopy(Stream *stream, uint64 elem_count, \ 1375 const DeviceMemory<double> &x, int incx, \ 1376 DeviceMemory<double> *y, int incy) override; \ 1377 bool DoBlasCopy(Stream *stream, uint64 elem_count, \ 1378 const DeviceMemory<std::complex<float>> &x, int incx, \ 1379 DeviceMemory<std::complex<float>> *y, int incy) override; \ 1380 bool DoBlasCopy(Stream *stream, uint64 elem_count, \ 1381 const DeviceMemory<std::complex<double>> &x, int incx, \ 1382 DeviceMemory<std::complex<double>> *y, int incy) override; \ 1383 bool DoBlasDot(Stream *stream, uint64 elem_count, \ 1384 const DeviceMemory<float> &x, int incx, \ 1385 const DeviceMemory<float> &y, int incy, \ 1386 DeviceMemory<float> *result) override; \ 1387 bool DoBlasDot(Stream *stream, uint64 elem_count, \ 1388 const DeviceMemory<double> &x, int incx, \ 1389 const DeviceMemory<double> &y, int incy, \ 1390 DeviceMemory<double> *result) override; \ 1391 bool DoBlasDotc(Stream *stream, uint64 elem_count, \ 1392 const DeviceMemory<std::complex<float>> &x, int incx, \ 1393 const DeviceMemory<std::complex<float>> &y, int incy, \ 1394 DeviceMemory<std::complex<float>> *result) override; \ 1395 bool DoBlasDotc(Stream *stream, uint64 elem_count, \ 1396 const DeviceMemory<std::complex<double>> &x, int incx, \ 1397 const DeviceMemory<std::complex<double>> &y, int incy, \ 1398 DeviceMemory<std::complex<double>> *result) override; \ 1399 bool DoBlasDotu(Stream *stream, uint64 elem_count, \ 1400 const DeviceMemory<std::complex<float>> &x, int incx, \ 1401 const DeviceMemory<std::complex<float>> &y, int incy, \ 1402 DeviceMemory<std::complex<float>> *result) override; \ 1403 bool DoBlasDotu(Stream *stream, uint64 elem_count, \ 1404 const DeviceMemory<std::complex<double>> &x, int incx, \ 1405 const DeviceMemory<std::complex<double>> &y, int incy, \ 1406 DeviceMemory<std::complex<double>> *result) override; \ 1407 bool DoBlasNrm2(Stream *stream, uint64 elem_count, \ 1408 const DeviceMemory<float> &x, int incx, \ 1409 DeviceMemory<float> *result) override; \ 1410 bool DoBlasNrm2(Stream *stream, uint64 elem_count, \ 1411 const DeviceMemory<double> &x, int incx, \ 1412 DeviceMemory<double> *result) override; \ 1413 bool DoBlasNrm2(Stream *stream, uint64 elem_count, \ 1414 const DeviceMemory<std::complex<float>> &x, int incx, \ 1415 DeviceMemory<float> *result) override; \ 1416 bool DoBlasNrm2(Stream *stream, uint64 elem_count, \ 1417 const DeviceMemory<std::complex<double>> &x, int incx, \ 1418 DeviceMemory<double> *result) override; \ 1419 bool DoBlasRot(Stream *stream, uint64 elem_count, DeviceMemory<float> *x, \ 1420 int incx, DeviceMemory<float> *y, int incy, float c, float s) \ 1421 override; \ 1422 bool DoBlasRot(Stream *stream, uint64 elem_count, DeviceMemory<double> *x, \ 1423 int incx, DeviceMemory<double> *y, int incy, double c, \ 1424 double s) override; \ 1425 bool DoBlasRot(Stream *stream, uint64 elem_count, \ 1426 DeviceMemory<std::complex<float>> *x, int incx, \ 1427 DeviceMemory<std::complex<float>> *y, int incy, float c, \ 1428 float s) override; \ 1429 bool DoBlasRot(Stream *stream, uint64 elem_count, \ 1430 DeviceMemory<std::complex<double>> *x, int incx, \ 1431 DeviceMemory<std::complex<double>> *y, int incy, double c, \ 1432 double s) override; \ 1433 bool DoBlasRotg(Stream *stream, DeviceMemory<float> *a, \ 1434 DeviceMemory<float> *b, DeviceMemory<float> *c, \ 1435 DeviceMemory<float> *s) override; \ 1436 bool DoBlasRotg(Stream *stream, DeviceMemory<double> *a, \ 1437 DeviceMemory<double> *b, DeviceMemory<double> *c, \ 1438 DeviceMemory<double> *s) override; \ 1439 bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a, \ 1440 DeviceMemory<std::complex<float>> *b, \ 1441 DeviceMemory<float> *c, \ 1442 DeviceMemory<std::complex<float>> *s) override; \ 1443 bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a, \ 1444 DeviceMemory<std::complex<double>> *b, \ 1445 DeviceMemory<double> *c, \ 1446 DeviceMemory<std::complex<double>> *s) override; \ 1447 bool DoBlasRotm(Stream *stream, uint64 elem_count, DeviceMemory<float> *x, \ 1448 int incx, DeviceMemory<float> *y, int incy, \ 1449 const DeviceMemory<float> ¶m) override; \ 1450 bool DoBlasRotm(Stream *stream, uint64 elem_count, DeviceMemory<double> *x, \ 1451 int incx, DeviceMemory<double> *y, int incy, \ 1452 const DeviceMemory<double> ¶m) override; \ 1453 bool DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1, \ 1454 DeviceMemory<float> *d2, DeviceMemory<float> *x1, \ 1455 const DeviceMemory<float> &y1, DeviceMemory<float> *param) \ 1456 override; \ 1457 bool DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1, \ 1458 DeviceMemory<double> *d2, DeviceMemory<double> *x1, \ 1459 const DeviceMemory<double> &y1, \ 1460 DeviceMemory<double> *param) override; \ 1461 bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha, \ 1462 DeviceMemory<float> *x, int incx) override; \ 1463 bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha, \ 1464 DeviceMemory<double> *x, int incx) override; \ 1465 bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha, \ 1466 DeviceMemory<std::complex<float>> *x, int incx) override; \ 1467 bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha, \ 1468 DeviceMemory<std::complex<double>> *x, int incx) override; \ 1469 bool DoBlasScal(Stream *stream, uint64 elem_count, \ 1470 std::complex<float> alpha, \ 1471 DeviceMemory<std::complex<float>> *x, int incx) override; \ 1472 bool DoBlasScal(Stream *stream, uint64 elem_count, \ 1473 std::complex<double> alpha, \ 1474 DeviceMemory<std::complex<double>> *x, int incx) override; \ 1475 bool DoBlasSwap(Stream *stream, uint64 elem_count, DeviceMemory<float> *x, \ 1476 int incx, DeviceMemory<float> *y, int incy) override; \ 1477 bool DoBlasSwap(Stream *stream, uint64 elem_count, DeviceMemory<double> *x, \ 1478 int incx, DeviceMemory<double> *y, int incy) override; \ 1479 bool DoBlasSwap(Stream *stream, uint64 elem_count, \ 1480 DeviceMemory<std::complex<float>> *x, int incx, \ 1481 DeviceMemory<std::complex<float>> *y, int incy) override; \ 1482 bool DoBlasSwap(Stream *stream, uint64 elem_count, \ 1483 DeviceMemory<std::complex<double>> *x, int incx, \ 1484 DeviceMemory<std::complex<double>> *y, int incy) override; \ 1485 bool DoBlasIamax(Stream *stream, uint64 elem_count, \ 1486 const DeviceMemory<float> &x, int incx, \ 1487 DeviceMemory<int> *result) override; \ 1488 bool DoBlasIamax(Stream *stream, uint64 elem_count, \ 1489 const DeviceMemory<double> &x, int incx, \ 1490 DeviceMemory<int> *result) override; \ 1491 bool DoBlasIamax(Stream *stream, uint64 elem_count, \ 1492 const DeviceMemory<std::complex<float>> &x, int incx, \ 1493 DeviceMemory<int> *result) override; \ 1494 bool DoBlasIamax(Stream *stream, uint64 elem_count, \ 1495 const DeviceMemory<std::complex<double>> &x, int incx, \ 1496 DeviceMemory<int> *result) override; \ 1497 bool DoBlasIamin(Stream *stream, uint64 elem_count, \ 1498 const DeviceMemory<float> &x, int incx, \ 1499 DeviceMemory<int> *result) override; \ 1500 bool DoBlasIamin(Stream *stream, uint64 elem_count, \ 1501 const DeviceMemory<double> &x, int incx, \ 1502 DeviceMemory<int> *result) override; \ 1503 bool DoBlasIamin(Stream *stream, uint64 elem_count, \ 1504 const DeviceMemory<std::complex<float>> &x, int incx, \ 1505 DeviceMemory<int> *result) override; \ 1506 bool DoBlasIamin(Stream *stream, uint64 elem_count, \ 1507 const DeviceMemory<std::complex<double>> &x, int incx, \ 1508 DeviceMemory<int> *result) override; \ 1509 bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1510 uint64 kl, uint64 ku, float alpha, \ 1511 const DeviceMemory<float> &a, int lda, \ 1512 const DeviceMemory<float> &x, int incx, float beta, \ 1513 DeviceMemory<float> *y, int incy) override; \ 1514 bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1515 uint64 kl, uint64 ku, double alpha, \ 1516 const DeviceMemory<double> &a, int lda, \ 1517 const DeviceMemory<double> &x, int incx, double beta, \ 1518 DeviceMemory<double> *y, int incy) override; \ 1519 bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1520 uint64 kl, uint64 ku, std::complex<float> alpha, \ 1521 const DeviceMemory<std::complex<float>> &a, int lda, \ 1522 const DeviceMemory<std::complex<float>> &x, int incx, \ 1523 std::complex<float> beta, \ 1524 DeviceMemory<std::complex<float>> *y, int incy) override; \ 1525 bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1526 uint64 kl, uint64 ku, std::complex<double> alpha, \ 1527 const DeviceMemory<std::complex<double>> &a, int lda, \ 1528 const DeviceMemory<std::complex<double>> &x, int incx, \ 1529 std::complex<double> beta, \ 1530 DeviceMemory<std::complex<double>> *y, int incy) override; \ 1531 bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1532 float alpha, const DeviceMemory<float> &a, int lda, \ 1533 const DeviceMemory<float> &x, int incx, float beta, \ 1534 DeviceMemory<float> *y, int incy) override; \ 1535 bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1536 double alpha, const DeviceMemory<double> &a, int lda, \ 1537 const DeviceMemory<double> &x, int incx, double beta, \ 1538 DeviceMemory<double> *y, int incy) override; \ 1539 bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1540 std::complex<float> alpha, \ 1541 const DeviceMemory<std::complex<float>> &a, int lda, \ 1542 const DeviceMemory<std::complex<float>> &x, int incx, \ 1543 std::complex<float> beta, \ 1544 DeviceMemory<std::complex<float>> *y, int incy) override; \ 1545 bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1546 std::complex<double> alpha, \ 1547 const DeviceMemory<std::complex<double>> &a, int lda, \ 1548 const DeviceMemory<std::complex<double>> &x, int incx, \ 1549 std::complex<double> beta, \ 1550 DeviceMemory<std::complex<double>> *y, int incy) override; \ 1551 bool DoBlasGemvWithProfiling( \ 1552 Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha, \ 1553 const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x, \ 1554 int incx, float beta, DeviceMemory<float> *y, int incy, \ 1555 blas::ProfileResult *output_profile_result) override; \ 1556 bool DoBlasGemvWithProfiling( \ 1557 Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha, \ 1558 const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x, \ 1559 int incx, double beta, DeviceMemory<double> *y, int incy, \ 1560 blas::ProfileResult *output_profile_result) override; \ 1561 bool DoBlasGemvWithProfiling( \ 1562 Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1563 std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a, \ 1564 int lda, const DeviceMemory<std::complex<float>> &x, int incx, \ 1565 std::complex<float> beta, DeviceMemory<std::complex<float>> *y, \ 1566 int incy, blas::ProfileResult *output_profile_result) override; \ 1567 bool DoBlasGemvWithProfiling( \ 1568 Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ 1569 std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a, \ 1570 int lda, const DeviceMemory<std::complex<double>> &x, int incx, \ 1571 std::complex<double> beta, DeviceMemory<std::complex<double>> *y, \ 1572 int incy, blas::ProfileResult *output_profile_result) override; \ 1573 bool DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha, \ 1574 const DeviceMemory<float> &x, int incx, \ 1575 const DeviceMemory<float> &y, int incy, \ 1576 DeviceMemory<float> *a, int lda) override; \ 1577 bool DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha, \ 1578 const DeviceMemory<double> &x, int incx, \ 1579 const DeviceMemory<double> &y, int incy, \ 1580 DeviceMemory<double> *a, int lda) override; \ 1581 bool DoBlasGerc(Stream *stream, uint64 m, uint64 n, \ 1582 std::complex<float> alpha, \ 1583 const DeviceMemory<std::complex<float>> &x, int incx, \ 1584 const DeviceMemory<std::complex<float>> &y, int incy, \ 1585 DeviceMemory<std::complex<float>> *a, int lda) override; \ 1586 bool DoBlasGerc(Stream *stream, uint64 m, uint64 n, \ 1587 std::complex<double> alpha, \ 1588 const DeviceMemory<std::complex<double>> &x, int incx, \ 1589 const DeviceMemory<std::complex<double>> &y, int incy, \ 1590 DeviceMemory<std::complex<double>> *a, int lda) override; \ 1591 bool DoBlasGeru(Stream *stream, uint64 m, uint64 n, \ 1592 std::complex<float> alpha, \ 1593 const DeviceMemory<std::complex<float>> &x, int incx, \ 1594 const DeviceMemory<std::complex<float>> &y, int incy, \ 1595 DeviceMemory<std::complex<float>> *a, int lda) override; \ 1596 bool DoBlasGeru(Stream *stream, uint64 m, uint64 n, \ 1597 std::complex<double> alpha, \ 1598 const DeviceMemory<std::complex<double>> &x, int incx, \ 1599 const DeviceMemory<std::complex<double>> &y, int incy, \ 1600 DeviceMemory<std::complex<double>> *a, int lda) override; \ 1601 bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k, \ 1602 std::complex<float> alpha, \ 1603 const DeviceMemory<std::complex<float>> &a, int lda, \ 1604 const DeviceMemory<std::complex<float>> &x, int incx, \ 1605 std::complex<float> beta, \ 1606 DeviceMemory<std::complex<float>> *y, int incy) override; \ 1607 bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k, \ 1608 std::complex<double> alpha, \ 1609 const DeviceMemory<std::complex<double>> &a, int lda, \ 1610 const DeviceMemory<std::complex<double>> &x, int incx, \ 1611 std::complex<double> beta, \ 1612 DeviceMemory<std::complex<double>> *y, int incy) override; \ 1613 bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1614 std::complex<float> alpha, \ 1615 const DeviceMemory<std::complex<float>> &a, int lda, \ 1616 const DeviceMemory<std::complex<float>> &x, int incx, \ 1617 std::complex<float> beta, \ 1618 DeviceMemory<std::complex<float>> *y, int incy) override; \ 1619 bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1620 std::complex<double> alpha, \ 1621 const DeviceMemory<std::complex<double>> &a, int lda, \ 1622 const DeviceMemory<std::complex<double>> &x, int incx, \ 1623 std::complex<double> beta, \ 1624 DeviceMemory<std::complex<double>> *y, int incy) override; \ 1625 bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \ 1626 const DeviceMemory<std::complex<float>> &x, int incx, \ 1627 DeviceMemory<std::complex<float>> *a, int lda) override; \ 1628 bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1629 double alpha, const DeviceMemory<std::complex<double>> &x, \ 1630 int incx, DeviceMemory<std::complex<double>> *a, int lda) \ 1631 override; \ 1632 bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1633 std::complex<float> alpha, \ 1634 const DeviceMemory<std::complex<float>> &x, int incx, \ 1635 const DeviceMemory<std::complex<float>> &y, int incy, \ 1636 DeviceMemory<std::complex<float>> *a, int lda) override; \ 1637 bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1638 std::complex<double> alpha, \ 1639 const DeviceMemory<std::complex<double>> &x, int incx, \ 1640 const DeviceMemory<std::complex<double>> &y, int incy, \ 1641 DeviceMemory<std::complex<double>> *a, int lda) override; \ 1642 bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1643 std::complex<float> alpha, \ 1644 const DeviceMemory<std::complex<float>> &ap, \ 1645 const DeviceMemory<std::complex<float>> &x, int incx, \ 1646 std::complex<float> beta, \ 1647 DeviceMemory<std::complex<float>> *y, int incy) override; \ 1648 bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1649 std::complex<double> alpha, \ 1650 const DeviceMemory<std::complex<double>> &ap, \ 1651 const DeviceMemory<std::complex<double>> &x, int incx, \ 1652 std::complex<double> beta, \ 1653 DeviceMemory<std::complex<double>> *y, int incy) override; \ 1654 bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \ 1655 const DeviceMemory<std::complex<float>> &x, int incx, \ 1656 DeviceMemory<std::complex<float>> *ap) override; \ 1657 bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1658 double alpha, const DeviceMemory<std::complex<double>> &x, \ 1659 int incx, DeviceMemory<std::complex<double>> *ap) override; \ 1660 bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1661 std::complex<float> alpha, \ 1662 const DeviceMemory<std::complex<float>> &x, int incx, \ 1663 const DeviceMemory<std::complex<float>> &y, int incy, \ 1664 DeviceMemory<std::complex<float>> *ap) override; \ 1665 bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1666 std::complex<double> alpha, \ 1667 const DeviceMemory<std::complex<double>> &x, int incx, \ 1668 const DeviceMemory<std::complex<double>> &y, int incy, \ 1669 DeviceMemory<std::complex<double>> *ap) override; \ 1670 bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k, \ 1671 float alpha, const DeviceMemory<float> &a, int lda, \ 1672 const DeviceMemory<float> &x, int incx, float beta, \ 1673 DeviceMemory<float> *y, int incy) override; \ 1674 bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k, \ 1675 double alpha, const DeviceMemory<double> &a, int lda, \ 1676 const DeviceMemory<double> &x, int incx, double beta, \ 1677 DeviceMemory<double> *y, int incy) override; \ 1678 bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1679 float alpha, const DeviceMemory<float> &ap, \ 1680 const DeviceMemory<float> &x, int incx, float beta, \ 1681 DeviceMemory<float> *y, int incy) override; \ 1682 bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1683 double alpha, const DeviceMemory<double> &ap, \ 1684 const DeviceMemory<double> &x, int incx, double beta, \ 1685 DeviceMemory<double> *y, int incy) override; \ 1686 bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \ 1687 const DeviceMemory<float> &x, int incx, \ 1688 DeviceMemory<float> *ap) override; \ 1689 bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1690 double alpha, const DeviceMemory<double> &x, int incx, \ 1691 DeviceMemory<double> *ap) override; \ 1692 bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1693 float alpha, const DeviceMemory<float> &x, int incx, \ 1694 const DeviceMemory<float> &y, int incy, \ 1695 DeviceMemory<float> *ap) override; \ 1696 bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1697 double alpha, const DeviceMemory<double> &x, int incx, \ 1698 const DeviceMemory<double> &y, int incy, \ 1699 DeviceMemory<double> *ap) override; \ 1700 bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1701 float alpha, const DeviceMemory<float> &a, int lda, \ 1702 const DeviceMemory<float> &x, int incx, float beta, \ 1703 DeviceMemory<float> *y, int incy) override; \ 1704 bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1705 double alpha, const DeviceMemory<double> &a, int lda, \ 1706 const DeviceMemory<double> &x, int incx, double beta, \ 1707 DeviceMemory<double> *y, int incy) override; \ 1708 bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \ 1709 const DeviceMemory<float> &x, int incx, \ 1710 DeviceMemory<float> *a, int lda) override; \ 1711 bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1712 double alpha, const DeviceMemory<double> &x, int incx, \ 1713 DeviceMemory<double> *a, int lda) override; \ 1714 bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1715 float alpha, const DeviceMemory<float> &x, int incx, \ 1716 const DeviceMemory<float> &y, int incy, \ 1717 DeviceMemory<float> *a, int lda) override; \ 1718 bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ 1719 double alpha, const DeviceMemory<double> &x, int incx, \ 1720 const DeviceMemory<double> &y, int incy, \ 1721 DeviceMemory<double> *a, int lda) override; \ 1722 bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, \ 1723 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1724 uint64 k, const DeviceMemory<float> &a, int lda, \ 1725 DeviceMemory<float> *x, int incx) override; \ 1726 bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, \ 1727 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1728 uint64 k, const DeviceMemory<double> &a, int lda, \ 1729 DeviceMemory<double> *x, int incx) override; \ 1730 bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, \ 1731 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1732 uint64 k, const DeviceMemory<std::complex<float>> &a, \ 1733 int lda, DeviceMemory<std::complex<float>> *x, int incx) \ 1734 override; \ 1735 bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, \ 1736 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1737 uint64 k, const DeviceMemory<std::complex<double>> &a, \ 1738 int lda, DeviceMemory<std::complex<double>> *x, int incx) \ 1739 override; \ 1740 bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, \ 1741 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1742 uint64 k, const DeviceMemory<float> &a, int lda, \ 1743 DeviceMemory<float> *x, int incx) override; \ 1744 bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, \ 1745 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1746 uint64 k, const DeviceMemory<double> &a, int lda, \ 1747 DeviceMemory<double> *x, int incx) override; \ 1748 bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, \ 1749 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1750 uint64 k, const DeviceMemory<std::complex<float>> &a, \ 1751 int lda, DeviceMemory<std::complex<float>> *x, int incx) \ 1752 override; \ 1753 bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, \ 1754 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1755 uint64 k, const DeviceMemory<std::complex<double>> &a, \ 1756 int lda, DeviceMemory<std::complex<double>> *x, int incx) \ 1757 override; \ 1758 bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, \ 1759 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1760 const DeviceMemory<float> &ap, DeviceMemory<float> *x, \ 1761 int incx) override; \ 1762 bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, \ 1763 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1764 const DeviceMemory<double> &ap, DeviceMemory<double> *x, \ 1765 int incx) override; \ 1766 bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, \ 1767 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1768 const DeviceMemory<std::complex<float>> &ap, \ 1769 DeviceMemory<std::complex<float>> *x, int incx) override; \ 1770 bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, \ 1771 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1772 const DeviceMemory<std::complex<double>> &ap, \ 1773 DeviceMemory<std::complex<double>> *x, int incx) override; \ 1774 bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, \ 1775 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1776 const DeviceMemory<float> &ap, DeviceMemory<float> *x, \ 1777 int incx) override; \ 1778 bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, \ 1779 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1780 const DeviceMemory<double> &ap, DeviceMemory<double> *x, \ 1781 int incx) override; \ 1782 bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, \ 1783 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1784 const DeviceMemory<std::complex<float>> &ap, \ 1785 DeviceMemory<std::complex<float>> *x, int incx) override; \ 1786 bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, \ 1787 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1788 const DeviceMemory<std::complex<double>> &ap, \ 1789 DeviceMemory<std::complex<double>> *x, int incx) override; \ 1790 bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, \ 1791 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1792 const DeviceMemory<float> &a, int lda, \ 1793 DeviceMemory<float> *x, int incx) override; \ 1794 bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, \ 1795 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1796 const DeviceMemory<double> &a, int lda, \ 1797 DeviceMemory<double> *x, int incx) override; \ 1798 bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, \ 1799 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1800 const DeviceMemory<std::complex<float>> &a, int lda, \ 1801 DeviceMemory<std::complex<float>> *x, int incx) override; \ 1802 bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, \ 1803 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1804 const DeviceMemory<std::complex<double>> &a, int lda, \ 1805 DeviceMemory<std::complex<double>> *x, int incx) override; \ 1806 bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, \ 1807 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1808 const DeviceMemory<float> &a, int lda, \ 1809 DeviceMemory<float> *x, int incx) override; \ 1810 bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, \ 1811 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1812 const DeviceMemory<double> &a, int lda, \ 1813 DeviceMemory<double> *x, int incx) override; \ 1814 bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, \ 1815 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1816 const DeviceMemory<std::complex<float>> &a, int lda, \ 1817 DeviceMemory<std::complex<float>> *x, int incx) override; \ 1818 bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, \ 1819 blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 1820 const DeviceMemory<std::complex<double>> &a, int lda, \ 1821 DeviceMemory<std::complex<double>> *x, int incx) override; \ 1822 bool DoBlasGemm(Stream *stream, blas::Transpose transa, \ 1823 blas::Transpose transb, uint64 m, uint64 n, uint64 k, \ 1824 float alpha, const DeviceMemory<Eigen::half> &a, int lda, \ 1825 const DeviceMemory<Eigen::half> &b, int ldb, float beta, \ 1826 DeviceMemory<Eigen::half> *c, int ldc) override; \ 1827 bool DoBlasGemm(Stream *stream, blas::Transpose transa, \ 1828 blas::Transpose transb, uint64 m, uint64 n, uint64 k, \ 1829 float alpha, const DeviceMemory<float> &a, int lda, \ 1830 const DeviceMemory<float> &b, int ldb, float beta, \ 1831 DeviceMemory<float> *c, int ldc) override; \ 1832 bool DoBlasGemm(Stream *stream, blas::Transpose transa, \ 1833 blas::Transpose transb, uint64 m, uint64 n, uint64 k, \ 1834 double alpha, const DeviceMemory<double> &a, int lda, \ 1835 const DeviceMemory<double> &b, int ldb, double beta, \ 1836 DeviceMemory<double> *c, int ldc) override; \ 1837 bool DoBlasGemm(Stream *stream, blas::Transpose transa, \ 1838 blas::Transpose transb, uint64 m, uint64 n, uint64 k, \ 1839 std::complex<float> alpha, \ 1840 const DeviceMemory<std::complex<float>> &a, int lda, \ 1841 const DeviceMemory<std::complex<float>> &b, int ldb, \ 1842 std::complex<float> beta, \ 1843 DeviceMemory<std::complex<float>> *c, int ldc) override; \ 1844 bool DoBlasGemm(Stream *stream, blas::Transpose transa, \ 1845 blas::Transpose transb, uint64 m, uint64 n, uint64 k, \ 1846 std::complex<double> alpha, \ 1847 const DeviceMemory<std::complex<double>> &a, int lda, \ 1848 const DeviceMemory<std::complex<double>> &b, int ldb, \ 1849 std::complex<double> beta, \ 1850 DeviceMemory<std::complex<double>> *c, int ldc) override; \ 1851 bool DoBlasGemmWithProfiling( \ 1852 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1853 uint64 m, uint64 n, uint64 k, float alpha, \ 1854 const DeviceMemory<Eigen::half> &a, int lda, \ 1855 const DeviceMemory<Eigen::half> &b, int ldb, float beta, \ 1856 DeviceMemory<Eigen::half> *c, int ldc, \ 1857 blas::ProfileResult *output_profile_result) override; \ 1858 bool DoBlasGemmWithProfiling( \ 1859 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1860 uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, \ 1861 int lda, const DeviceMemory<float> &b, int ldb, float beta, \ 1862 DeviceMemory<float> *c, int ldc, \ 1863 blas::ProfileResult *output_profile_result) override; \ 1864 bool DoBlasGemmWithProfiling( \ 1865 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1866 uint64 m, uint64 n, uint64 k, double alpha, \ 1867 const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b, \ 1868 int ldb, double beta, DeviceMemory<double> *c, int ldc, \ 1869 blas::ProfileResult *output_profile_result) override; \ 1870 bool DoBlasGemmWithProfiling( \ 1871 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1872 uint64 m, uint64 n, uint64 k, std::complex<float> alpha, \ 1873 const DeviceMemory<std::complex<float>> &a, int lda, \ 1874 const DeviceMemory<std::complex<float>> &b, int ldb, \ 1875 std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, \ 1876 blas::ProfileResult *output_profile_result) override; \ 1877 bool DoBlasGemmWithProfiling( \ 1878 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1879 uint64 m, uint64 n, uint64 k, std::complex<double> alpha, \ 1880 const DeviceMemory<std::complex<double>> &a, int lda, \ 1881 const DeviceMemory<std::complex<double>> &b, int ldb, \ 1882 std::complex<double> beta, DeviceMemory<std::complex<double>> *c, \ 1883 int ldc, blas::ProfileResult *output_profile_result) override; \ 1884 bool GetBlasGemmAlgorithms(std::vector<blas::AlgorithmType> *out_algorithms) \ 1885 override; \ 1886 bool DoBlasGemmWithAlgorithm( \ 1887 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1888 uint64 m, uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a, \ 1889 int lda, const DeviceMemory<int8> &b, int ldb, int beta, \ 1890 DeviceMemory<int> *c, int ldc, blas::ComputationType computation_type, \ 1891 blas::AlgorithmType algorithm, \ 1892 blas::ProfileResult *output_profile_result) override; \ 1893 bool DoBlasGemmWithAlgorithm( \ 1894 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1895 uint64 m, uint64 n, uint64 k, const Eigen::half &alpha, \ 1896 const DeviceMemory<Eigen::half> &a, int lda, \ 1897 const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta, \ 1898 DeviceMemory<Eigen::half> *c, int ldc, \ 1899 blas::ComputationType computation_type, blas::AlgorithmType algorithm, \ 1900 blas::ProfileResult *output_profile_result) override; \ 1901 bool DoBlasGemmWithAlgorithm( \ 1902 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1903 uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, \ 1904 int lda, const DeviceMemory<float> &b, int ldb, float beta, \ 1905 DeviceMemory<float> *c, int ldc, blas::ComputationType computation_type, \ 1906 blas::AlgorithmType algorithm, \ 1907 blas::ProfileResult *output_profile_result) override; \ 1908 bool DoBlasGemmWithAlgorithm( \ 1909 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1910 uint64 m, uint64 n, uint64 k, double alpha, \ 1911 const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b, \ 1912 int ldb, double beta, DeviceMemory<double> *c, int ldc, \ 1913 blas::ComputationType computation_type, blas::AlgorithmType algorithm, \ 1914 blas::ProfileResult *output_profile_result) override; \ 1915 bool DoBlasGemmWithAlgorithm( \ 1916 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1917 uint64 m, uint64 n, uint64 k, std::complex<float> alpha, \ 1918 const DeviceMemory<std::complex<float>> &a, int lda, \ 1919 const DeviceMemory<std::complex<float>> &b, int ldb, \ 1920 std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, \ 1921 blas::ComputationType computation_type, blas::AlgorithmType algorithm, \ 1922 blas::ProfileResult *output_profile_result) override; \ 1923 bool DoBlasGemmWithAlgorithm( \ 1924 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1925 uint64 m, uint64 n, uint64 k, std::complex<double> alpha, \ 1926 const DeviceMemory<std::complex<double>> &a, int lda, \ 1927 const DeviceMemory<std::complex<double>> &b, int ldb, \ 1928 std::complex<double> beta, DeviceMemory<std::complex<double>> *c, \ 1929 int ldc, blas::ComputationType computation_type, \ 1930 blas::AlgorithmType algorithm, \ 1931 blas::ProfileResult *output_profile_result) override; \ 1932 bool DoBlasGemmBatched( \ 1933 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1934 uint64 m, uint64 n, uint64 k, float alpha, \ 1935 const port::ArraySlice<DeviceMemory<float> *> &a, int lda, \ 1936 const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, \ 1937 const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, \ 1938 int batch_count, ScratchAllocator *scratch_allocator) override; \ 1939 bool DoBlasGemmBatched( \ 1940 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1941 uint64 m, uint64 n, uint64 k, double alpha, \ 1942 const port::ArraySlice<DeviceMemory<double> *> &a, int lda, \ 1943 const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, \ 1944 const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, \ 1945 int batch_count, ScratchAllocator *scratch_allocator) override; \ 1946 bool DoBlasGemmBatched( \ 1947 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1948 uint64 m, uint64 n, uint64 k, std::complex<float> alpha, \ 1949 const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda, \ 1950 const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, \ 1951 std::complex<float> beta, \ 1952 const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, \ 1953 int batch_count, ScratchAllocator *scratch_allocator) override; \ 1954 bool DoBlasGemmBatched( \ 1955 Stream *stream, blas::Transpose transa, blas::Transpose transb, \ 1956 uint64 m, uint64 n, uint64 k, std::complex<double> alpha, \ 1957 const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, \ 1958 int lda, \ 1959 const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, \ 1960 int ldb, std::complex<double> beta, \ 1961 const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, \ 1962 int ldc, int batch_count, ScratchAllocator *scratch_allocator) override; \ 1963 bool DoBlasHemm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 1964 uint64 m, uint64 n, std::complex<float> alpha, \ 1965 const DeviceMemory<std::complex<float>> &a, int lda, \ 1966 const DeviceMemory<std::complex<float>> &b, int ldb, \ 1967 std::complex<float> beta, \ 1968 DeviceMemory<std::complex<float>> *c, int ldc) override; \ 1969 bool DoBlasHemm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 1970 uint64 m, uint64 n, std::complex<double> alpha, \ 1971 const DeviceMemory<std::complex<double>> &a, int lda, \ 1972 const DeviceMemory<std::complex<double>> &b, int ldb, \ 1973 std::complex<double> beta, \ 1974 DeviceMemory<std::complex<double>> *c, int ldc) override; \ 1975 bool DoBlasHerk(Stream *stream, blas::UpperLower uplo, \ 1976 blas::Transpose trans, uint64 n, uint64 k, float alpha, \ 1977 const DeviceMemory<std::complex<float>> &a, int lda, \ 1978 float beta, DeviceMemory<std::complex<float>> *c, int ldc) \ 1979 override; \ 1980 bool DoBlasHerk(Stream *stream, blas::UpperLower uplo, \ 1981 blas::Transpose trans, uint64 n, uint64 k, double alpha, \ 1982 const DeviceMemory<std::complex<double>> &a, int lda, \ 1983 double beta, DeviceMemory<std::complex<double>> *c, int ldc) \ 1984 override; \ 1985 bool DoBlasHer2k( \ 1986 Stream *stream, blas::UpperLower uplo, blas::Transpose trans, uint64 n, \ 1987 uint64 k, std::complex<float> alpha, \ 1988 const DeviceMemory<std::complex<float>> &a, int lda, \ 1989 const DeviceMemory<std::complex<float>> &b, int ldb, float beta, \ 1990 DeviceMemory<std::complex<float>> *c, int ldc) override; \ 1991 bool DoBlasHer2k( \ 1992 Stream *stream, blas::UpperLower uplo, blas::Transpose trans, uint64 n, \ 1993 uint64 k, std::complex<double> alpha, \ 1994 const DeviceMemory<std::complex<double>> &a, int lda, \ 1995 const DeviceMemory<std::complex<double>> &b, int ldb, double beta, \ 1996 DeviceMemory<std::complex<double>> *c, int ldc) override; \ 1997 bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 1998 uint64 m, uint64 n, float alpha, \ 1999 const DeviceMemory<float> &a, int lda, \ 2000 const DeviceMemory<float> &b, int ldb, float beta, \ 2001 DeviceMemory<float> *c, int ldc) override; \ 2002 bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2003 uint64 m, uint64 n, double alpha, \ 2004 const DeviceMemory<double> &a, int lda, \ 2005 const DeviceMemory<double> &b, int ldb, double beta, \ 2006 DeviceMemory<double> *c, int ldc) override; \ 2007 bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2008 uint64 m, uint64 n, std::complex<float> alpha, \ 2009 const DeviceMemory<std::complex<float>> &a, int lda, \ 2010 const DeviceMemory<std::complex<float>> &b, int ldb, \ 2011 std::complex<float> beta, \ 2012 DeviceMemory<std::complex<float>> *c, int ldc) override; \ 2013 bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2014 uint64 m, uint64 n, std::complex<double> alpha, \ 2015 const DeviceMemory<std::complex<double>> &a, int lda, \ 2016 const DeviceMemory<std::complex<double>> &b, int ldb, \ 2017 std::complex<double> beta, \ 2018 DeviceMemory<std::complex<double>> *c, int ldc) override; \ 2019 bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, \ 2020 blas::Transpose trans, uint64 n, uint64 k, float alpha, \ 2021 const DeviceMemory<float> &a, int lda, float beta, \ 2022 DeviceMemory<float> *c, int ldc) override; \ 2023 bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, \ 2024 blas::Transpose trans, uint64 n, uint64 k, double alpha, \ 2025 const DeviceMemory<double> &a, int lda, double beta, \ 2026 DeviceMemory<double> *c, int ldc) override; \ 2027 bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, \ 2028 blas::Transpose trans, uint64 n, uint64 k, \ 2029 std::complex<float> alpha, \ 2030 const DeviceMemory<std::complex<float>> &a, int lda, \ 2031 std::complex<float> beta, \ 2032 DeviceMemory<std::complex<float>> *c, int ldc) override; \ 2033 bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, \ 2034 blas::Transpose trans, uint64 n, uint64 k, \ 2035 std::complex<double> alpha, \ 2036 const DeviceMemory<std::complex<double>> &a, int lda, \ 2037 std::complex<double> beta, \ 2038 DeviceMemory<std::complex<double>> *c, int ldc) override; \ 2039 bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, \ 2040 blas::Transpose trans, uint64 n, uint64 k, float alpha, \ 2041 const DeviceMemory<float> &a, int lda, \ 2042 const DeviceMemory<float> &b, int ldb, float beta, \ 2043 DeviceMemory<float> *c, int ldc) override; \ 2044 bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, \ 2045 blas::Transpose trans, uint64 n, uint64 k, double alpha, \ 2046 const DeviceMemory<double> &a, int lda, \ 2047 const DeviceMemory<double> &b, int ldb, double beta, \ 2048 DeviceMemory<double> *c, int ldc) override; \ 2049 bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, \ 2050 blas::Transpose trans, uint64 n, uint64 k, \ 2051 std::complex<float> alpha, \ 2052 const DeviceMemory<std::complex<float>> &a, int lda, \ 2053 const DeviceMemory<std::complex<float>> &b, int ldb, \ 2054 std::complex<float> beta, \ 2055 DeviceMemory<std::complex<float>> *c, int ldc) override; \ 2056 bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, \ 2057 blas::Transpose trans, uint64 n, uint64 k, \ 2058 std::complex<double> alpha, \ 2059 const DeviceMemory<std::complex<double>> &a, int lda, \ 2060 const DeviceMemory<std::complex<double>> &b, int ldb, \ 2061 std::complex<double> beta, \ 2062 DeviceMemory<std::complex<double>> *c, int ldc) override; \ 2063 bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2064 blas::Transpose transa, blas::Diagonal diag, uint64 m, \ 2065 uint64 n, float alpha, const DeviceMemory<float> &a, \ 2066 int lda, DeviceMemory<float> *b, int ldb) override; \ 2067 bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2068 blas::Transpose transa, blas::Diagonal diag, uint64 m, \ 2069 uint64 n, double alpha, const DeviceMemory<double> &a, \ 2070 int lda, DeviceMemory<double> *b, int ldb) override; \ 2071 bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2072 blas::Transpose transa, blas::Diagonal diag, uint64 m, \ 2073 uint64 n, std::complex<float> alpha, \ 2074 const DeviceMemory<std::complex<float>> &a, int lda, \ 2075 DeviceMemory<std::complex<float>> *b, int ldb) override; \ 2076 bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2077 blas::Transpose transa, blas::Diagonal diag, uint64 m, \ 2078 uint64 n, std::complex<double> alpha, \ 2079 const DeviceMemory<std::complex<double>> &a, int lda, \ 2080 DeviceMemory<std::complex<double>> *b, int ldb) override; \ 2081 bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2082 blas::Transpose transa, blas::Diagonal diag, uint64 m, \ 2083 uint64 n, float alpha, const DeviceMemory<float> &a, \ 2084 int lda, DeviceMemory<float> *b, int ldb) override; \ 2085 bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2086 blas::Transpose transa, blas::Diagonal diag, uint64 m, \ 2087 uint64 n, double alpha, const DeviceMemory<double> &a, \ 2088 int lda, DeviceMemory<double> *b, int ldb) override; \ 2089 bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2090 blas::Transpose transa, blas::Diagonal diag, uint64 m, \ 2091 uint64 n, std::complex<float> alpha, \ 2092 const DeviceMemory<std::complex<float>> &a, int lda, \ 2093 DeviceMemory<std::complex<float>> *b, int ldb) override; \ 2094 bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ 2095 blas::Transpose transa, blas::Diagonal diag, uint64 m, \ 2096 uint64 n, std::complex<double> alpha, \ 2097 const DeviceMemory<std::complex<double>> &a, int lda, \ 2098 DeviceMemory<std::complex<double>> *b, int ldb) override; 2099 2100 } // namespace blas 2101 } // namespace gputools 2102 } // namespace perftools 2103 2104 #endif // TENSORFLOW_STREAM_EXECUTOR_BLAS_H_ 2105