Home | History | Annotate | Download | only in cuda
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "cuda/include/cublas_v2.h"
     17 #include "cuda/include/cuda.h"
     18 
     19 #define SE_CUDA_DATA_HALF CUDA_R_16F
     20 
     21 #include "tensorflow/stream_executor/cuda/cuda_blas.h"
     22 
     23 // Both Eigen Half.h and CUDA cuda_fp16.h provide similar typedef for __half. As
     24 // such, there are two ways to get the typedef for __half:
     25 //
     26 // (1) Includes cuda_fp16.h and defines EIGEN_HAS_CUDA_FP16.
     27 // (2) Neither includes cuda_fp16.h nor defines EIGEN_HAS_CUDA_FP16.
     28 //
// Due to issue b/73793421, when the first approach is used and NVCC is used to
// compile this file, NVCC will complain about a duplicated definition of
// EIGEN_HAS_CUDA_FP16. On the other hand, when the second approach is used and
// clang is used to compile this file, clang will not understand __half
// because both the definition and the macro EIGEN_HAS_CUDA_FP16 are missing.
//
// Because this file may be compiled with clang but will never be compiled with
// NVCC, we choose the first approach for CUDA < 9.0. For CUDA >= 9.0, we have
// to use the second approach, because the data member of the __half defined
// by CUDA >= 9.0 is `__x` while Eigen expects it to be `x`.
//
// TODO(b/73793421): Remove the following code block to switch to the second
// approach when the issue is fixed.
     42 #if CUDA_VERSION < 9000
     43 #include "cuda/include/cuda_fp16.h"
     44 #define EIGEN_HAS_CUDA_FP16
     45 #endif
     46 
     47 #include "third_party/eigen3/Eigen/Core"
     48 
     49 #include <assert.h>
     50 #include <complex>
     51 
     52 #include "absl/strings/str_cat.h"
     53 #include "tensorflow/core/util/env_var.h"
     54 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
     55 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
     56 #include "tensorflow/stream_executor/cuda/cuda_helpers.h"
     57 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
     58 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
     59 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
     60 #include "tensorflow/stream_executor/device_memory.h"
     61 #include "tensorflow/stream_executor/lib/env.h"
     62 #include "tensorflow/stream_executor/lib/initialize.h"
     63 #include "tensorflow/stream_executor/lib/status.h"
     64 #include "tensorflow/stream_executor/lib/status_macros.h"
     65 #include "tensorflow/stream_executor/lib/stringprintf.h"
     66 #include "tensorflow/stream_executor/platform/logging.h"
     67 #include "tensorflow/stream_executor/platform/port.h"
     68 #include "tensorflow/stream_executor/plugin_registry.h"
     69 #include "tensorflow/stream_executor/scratch_allocator.h"
     70 #include "tensorflow/stream_executor/stream_executor.h"
     71 
     72 namespace stream_executor {
     73 namespace gpu {
     74 
// Defines kCuBlasPlugin, the PluginId under which this cuBLAS implementation
// is registered with the stream_executor plugin registry.
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
     76 
     77 static string ToString(cublasStatus_t status) {
     78   switch (status) {
     79     case CUBLAS_STATUS_SUCCESS:
     80       return "CUBLAS_STATUS_SUCCESS";
     81     case CUBLAS_STATUS_NOT_INITIALIZED:
     82       return "CUBLAS_STATUS_NOT_INITIALIZED";
     83     case CUBLAS_STATUS_ALLOC_FAILED:
     84       return "CUBLAS_STATUS_ALLOC_FAILED";
     85     case CUBLAS_STATUS_INVALID_VALUE:
     86       return "CUBLAS_STATUS_INVALID_VALUE";
     87     case CUBLAS_STATUS_ARCH_MISMATCH:
     88       return "CUBLAS_STATUS_ARCH_MISMATCH";
     89     case CUBLAS_STATUS_MAPPING_ERROR:
     90       return "CUBLAS_STATUS_MAPPING_ERROR";
     91     case CUBLAS_STATUS_EXECUTION_FAILED:
     92       return "CUBLAS_STATUS_EXECUTION_FAILED";
     93     case CUBLAS_STATUS_INTERNAL_ERROR:
     94       return "CUBLAS_STATUS_INTERNAL_ERROR";
     95 #if CUDA_VERSION >= 8000
     96     case CUBLAS_STATUS_NOT_SUPPORTED:
     97       return "CUBLAS_STATUS_NOT_SUPPORTED";
     98     case CUBLAS_STATUS_LICENSE_ERROR:
     99       return "CUBLAS_STATUS_LICENSE_ERROR";
    100 #endif
    101     default:
    102       return absl::StrCat("<invalid cublas status: ", status, ">");
    103   }
    104 }
    105 
    106 // Decide whether to enable TENSOR_OP_MATH
    107 static bool TensorOpMathEnabled() {
    108   static bool is_enabled = [] {
    109     bool is_disabled;
    110     TF_CHECK_OK(
    111         tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
    112                                        /*default_val=*/false, &is_disabled));
    113     return !is_disabled;
    114   }();
    115   return is_enabled;
    116 }
    117 
    118 // cuBLAS has interfaces that permit pointers to be passed from either the host
    119 // memory space or the device memory space; however, you must instruct it as to
    120 // which address space those pointers are in with cublasSetPointerMode.
    121 //
    122 // This helper sets the cuBLAS pointer mode to a desired value for a cuBLAS call
    123 // you are about to perform in a given scope.
    124 //
    125 // The prior cuBLAS pointer mode is retained and restored when this object goes
    126 // out of scope.
    127 class ScopedCublasPointerMode {
    128  public:
    129   // Note that, because the setting of the cublas pointer mode is fallible,
    130   // construction of this scoped datatype must be paired with a call to
    131   // Init().
    132   //
    133   // Parameters:
    134   //  handle: The cublas library handle to act upon in setting the pointer mode.
    135   explicit ScopedCublasPointerMode(cublasHandle_t handle)
    136       : handle_(handle), ok_(false) {}
    137 
    138   // Attempts the switch to the requested scoped pointer mode, new_mode.
    139   //
    140   // Note that when false is returned, an appropriate error has already been
    141   // logged.
    142   bool Init(cublasPointerMode_t new_mode) {
    143     cublasStatus_t ret = cublasGetPointerMode(handle_, &old_mode_);
    144     if (ret != CUBLAS_STATUS_SUCCESS) {
    145       LOG(ERROR) << "failed to get old cublas pointer mode: " << ToString(ret);
    146       return ok_ = false;
    147     }
    148 
    149     ret = cublasSetPointerMode(handle_, new_mode);
    150     if (ret != CUBLAS_STATUS_SUCCESS) {
    151       LOG(ERROR) << "failed to set new cublas pointer mode: " << ToString(ret);
    152       return ok_ = false;
    153     }
    154 
    155     return ok_ = true;
    156   }
    157 
    158   // Switches back to the prior pointer mode, if the switch operation was
    159   // successful in the first place.
    160   ~ScopedCublasPointerMode() {
    161     if (ok_) {
    162       cublasStatus_t ret = cublasSetPointerMode(handle_, old_mode_);
    163       if (ret != CUBLAS_STATUS_SUCCESS) {
    164         LOG(ERROR) << "failed to set former cublas pointer mode: "
    165                    << ToString(ret);
    166       }
    167     }
    168   }
    169 
    170  private:
    171   cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
    172   cublasPointerMode_t old_mode_;  // Prior cuBLAS pointer mode, to be restored.
    173   bool ok_;                       // Whether the change was successful.
    174 };
    175 
    176 #if CUDA_VERSION >= 9000
    177 // cuBLAS has interfaces that permit computations to use the Volta hardware.
    178 // This must be enabled via the cublasGet/SetMathMode APIs.
    179 //
    180 // This helper sets the cuBLAS math mode to a desired value for a cuBLAS call
    181 // you are about to perform in a given scope.
    182 //
    183 // The prior cuBLAS math mode is retained and restored when this object goes
    184 // out of scope.
    185 class ScopedCublasMathMode {
    186  public:
    187   // Note that, because the setting of the cublas math mode is fallible,
    188   // construction of this scoped datatype must be paired with a call to
    189   // Init().
    190   //
    191   // Parameters:
    192   //  handle: The cublas library handle to act upon in setting the math mode.
    193   explicit ScopedCublasMathMode(cublasHandle_t handle)
    194       : handle_(handle), ok_(false) {}
    195 
    196   // Attempts the switch to the requested scoped math mode, new_mode.
    197   //
    198   // Note that when false is returned, an appropriate error has already been
    199   // logged.
    200   bool Init(cublasMath_t new_mode) {
    201     cublasStatus_t ret = cublasGetMathMode(handle_, &old_mode_);
    202     if (ret != CUBLAS_STATUS_SUCCESS) {
    203       LOG(ERROR) << "failed to get old cublas math mode: " << ToString(ret);
    204       return ok_ = false;
    205     }
    206 
    207     ret = cublasSetMathMode(handle_, new_mode);
    208     if (ret != CUBLAS_STATUS_SUCCESS) {
    209       LOG(ERROR) << "failed to set new cublas math mode: " << ToString(ret);
    210       return ok_ = false;
    211     }
    212     return ok_ = true;
    213   }
    214 
    215   // Switches back to the prior math mode, if the switch operation was
    216   // successful in the first place.
    217   ~ScopedCublasMathMode() {
    218     if (ok_) {
    219       cublasStatus_t ret = cublasSetMathMode(handle_, old_mode_);
    220       if (ret != CUBLAS_STATUS_SUCCESS) {
    221         LOG(ERROR) << "failed to set former cublas math mode: "
    222                    << ToString(ret);
    223       }
    224     }
    225   }
    226 
    227  private:
    228   cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
    229   cublasMath_t old_mode_;  // Prior cuBLAS math mode, to be restored.
    230   bool ok_;                // Whether the change was successful.
    231 };
    232 #endif  // CUDA_VERSION >= 9000
    233 
    234 bool CUDABlas::Init() {
    235   gpu::ScopedActivateExecutorContext sac{parent_};
    236   cublasStatus_t ret = cublasCreate(&blas_);
    237   if (ret != CUBLAS_STATUS_SUCCESS) {
    238     LOG(ERROR) << "failed to create cublas handle: " << ToString(ret);
    239     return false;
    240   }
    241 
    242   return true;
    243 }
    244 
// Wires up the owning executor (must be non-null). The cublas handle itself
// is not created here; Init() must be called before any BLAS routine is used.
CUDABlas::CUDABlas(gpu::GpuExecutor *parent)
    : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}
    247 
    248 CUDABlas::~CUDABlas() {
    249   if (blas_ != nullptr) {
    250     gpu::ScopedActivateExecutorContext sac{parent_};
    251     cublasDestroy(blas_);
    252   }
    253 }
    254 
    255 bool CUDABlas::SetStream(Stream *stream) {
    256   CHECK(stream != nullptr);
    257   CHECK(AsGpuStreamValue(stream) != nullptr);
    258   CHECK(blas_ != nullptr);
    259   gpu::ScopedActivateExecutorContext sac{parent_};
    260   cublasStatus_t ret = cublasSetStream(blas_, AsGpuStreamValue(stream));
    261   if (ret != CUBLAS_STATUS_SUCCESS) {
    262     LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret);
    263     return false;
    264   }
    265 
    266   return true;
    267 }
    268 
    269 namespace {
    270 
    271 // Helper functions transforming blas arguments into cuBLAS arguments.
    272 
    273 cublasOperation_t CUDABlasTranspose(blas::Transpose trans) {
    274   switch (trans) {
    275     case blas::Transpose::kNoTranspose:
    276       return CUBLAS_OP_N;
    277     case blas::Transpose::kTranspose:
    278       return CUBLAS_OP_T;
    279     case blas::Transpose::kConjugateTranspose:
    280       return CUBLAS_OP_C;
    281     default:
    282       LOG(FATAL) << "Invalid value of blas::Transpose.";
    283   }
    284 }
    285 
    286 cublasFillMode_t CUDABlasUpperLower(blas::UpperLower uplo) {
    287   switch (uplo) {
    288     case blas::UpperLower::kUpper:
    289       return CUBLAS_FILL_MODE_UPPER;
    290     case blas::UpperLower::kLower:
    291       return CUBLAS_FILL_MODE_LOWER;
    292     default:
    293       LOG(FATAL) << "Invalid value of blas::UpperLower.";
    294   }
    295 }
    296 
    297 cublasDiagType_t CUDABlasDiagonal(blas::Diagonal diag) {
    298   switch (diag) {
    299     case blas::Diagonal::kUnit:
    300       return CUBLAS_DIAG_UNIT;
    301     case blas::Diagonal::kNonUnit:
    302       return CUBLAS_DIAG_NON_UNIT;
    303     default:
    304       LOG(FATAL) << "Invalid value of blas::Diagonal.";
    305   }
    306 }
    307 
    308 cublasSideMode_t CUDABlasSide(blas::Side side) {
    309   switch (side) {
    310     case blas::Side::kLeft:
    311       return CUBLAS_SIDE_LEFT;
    312     case blas::Side::kRight:
    313       return CUBLAS_SIDE_RIGHT;
    314     default:
    315       LOG(FATAL) << "Invalid value of blas::Side.";
    316   }
    317 }
    318 
// CUDADataType<T>::type translates from a C++ type (e.g. float) to a
// cudaDataType_t (e.g. CUDA_R_32F).  CUDAComputationType(ty) translates from a
// blas::ComputationType to a cudaDataType_t.
//
// These are used to build the argument type and computation type args to
// cublasGemmEx.
//
// The primary template is intentionally left undefined so that using an
// unsupported element type fails at compile time.
template <typename T>
struct CUDADataType;

// Half precision maps through SE_CUDA_DATA_HALF (defined near the top of this
// file as CUDA_R_16F).
template <>
struct CUDADataType<Eigen::half> {
  static constexpr cudaDataType_t type = SE_CUDA_DATA_HALF;
};

template <>
struct CUDADataType<std::complex<Eigen::half>> {
  static constexpr cudaDataType_t type = CUDA_C_16F;
};

template <>
struct CUDADataType<float> {
  static constexpr cudaDataType_t type = CUDA_R_32F;
};

template <>
struct CUDADataType<std::complex<float>> {
  static constexpr cudaDataType_t type = CUDA_C_32F;
};

template <>
struct CUDADataType<double> {
  static constexpr cudaDataType_t type = CUDA_R_64F;
};

template <>
struct CUDADataType<std::complex<double>> {
  static constexpr cudaDataType_t type = CUDA_C_64F;
};

// Integer variants, used by the integer GEMM paths.
template <>
struct CUDADataType<int> {
  static constexpr cudaDataType_t type = CUDA_R_32I;
};

template <>
struct CUDADataType<int8> {
  static constexpr cudaDataType_t type = CUDA_R_8I;
};

template <>
struct CUDADataType<std::complex<int8>> {
  static constexpr cudaDataType_t type = CUDA_C_8I;
};

template <>
struct CUDADataType<uint8> {
  static constexpr cudaDataType_t type = CUDA_R_8U;
};

template <>
struct CUDADataType<std::complex<uint8>> {
  static constexpr cudaDataType_t type = CUDA_C_8U;
};
    382 
    383 cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
    384   switch (ty) {
    385     case blas::ComputationType::kF16:
    386       return CUDA_R_16F;
    387     case blas::ComputationType::kF32:
    388       return CUDA_R_32F;
    389     case blas::ComputationType::kF64:
    390       return CUDA_R_64F;
    391     case blas::ComputationType::kI32:
    392       return CUDA_R_32I;
    393     case blas::ComputationType::kComplexF32:
    394       return CUDA_C_32F;
    395     case blas::ComputationType::kComplexF64:
    396       return CUDA_C_64F;
    397   }
    398 }
    399 }  // namespace
    400 
// Shared implementation behind every DoBlas* wrapper: serializes access to the
// cublas handle, binds the target stream, configures the cuBLAS pointer mode
// (and, when requested, tensor-op math mode) for the duration of the call,
// invokes `cublas_func(blas_, args...)`, and returns whether it succeeded.
//
// Parameters:
//  cublas_func: the cuBLAS entry point to invoke.
//  stream: stream on which the cuBLAS operation is enqueued.
//  pointer_mode_host: true if scalar arguments (e.g. alpha/beta, scalar
//    results) are host pointers; false if they are device pointers.
//  err_on_failure: whether a failing cuBLAS status is logged as an error
//    (a failure is also logged when VLOG level 3 is active).
//  use_tensor_op_math: whether to enable CUBLAS_TENSOR_OP_MATH for this call;
//    only has an effect when built against CUDA >= 9.0.
//  args: remaining arguments forwarded to cublas_func after the handle.
template <typename FuncT, typename... Args>
bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                  bool pointer_mode_host, bool err_on_failure,
                                  bool use_tensor_op_math, Args... args) {
  mutex_lock lock(mu_);  // The cublas handle is shared; serialize its use.

  CHECK(blas_ != nullptr);
  if (!SetStream(stream)) {
    return false;
  }

  gpu::ScopedActivateExecutorContext sac{parent_};
  // The scoped objects below restore the prior pointer/math modes when they
  // go out of scope at the end of this function.
  ScopedCublasPointerMode pointer_mode{blas_};
  if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST
                                           : CUBLAS_POINTER_MODE_DEVICE)) {
    return false;
  }
#if CUDA_VERSION >= 9000
  ScopedCublasMathMode math_mode{blas_};
  if (use_tensor_op_math) {
    if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
      return false;
    }
  }
#endif
  cublasStatus_t ret = cublas_func(blas_, args...);
  if ((err_on_failure || VLOG_IS_ON(3)) && ret != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to run cuBLAS routine: " << ToString(ret);
  }
  return ret == CUBLAS_STATUS_SUCCESS;
}
    432 
    433 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
    434                           const DeviceMemory<float> &x, int incx,
    435                           DeviceMemory<float> *result) {
    436   return DoBlasInternal(cublasSasum, stream, false /* = pointer_mode_host */,
    437                         elem_count, GpuMemory(x), incx,
    438                         GpuMemoryMutable(result));
    439 }
    440 
    441 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
    442                           const DeviceMemory<double> &x, int incx,
    443                           DeviceMemory<double> *result) {
    444   return DoBlasInternal(cublasDasum, stream, false /* = pointer_mode_host */,
    445                         elem_count, GpuMemory(x), incx,
    446                         GpuMemoryMutable(result));
    447 }
    448 
    449 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
    450                           const DeviceMemory<std::complex<float>> &x, int incx,
    451                           DeviceMemory<float> *result) {
    452   return DoBlasInternal(cublasScasum, stream, false /* = pointer_mode_host */,
    453                         elem_count, GpuComplex(GpuMemory(x)), incx,
    454                         GpuMemoryMutable(result));
    455 }
    456 
    457 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
    458                           const DeviceMemory<std::complex<double>> &x, int incx,
    459                           DeviceMemory<double> *result) {
    460   return DoBlasInternal(cublasDzasum, stream, false /* = pointer_mode_host */,
    461                         elem_count, GpuComplex(GpuMemory(x)), incx,
    462                         GpuMemoryMutable(result));
    463 }
    464 
    465 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
    466                           const DeviceMemory<float> &x, int incx,
    467                           DeviceMemory<float> *y, int incy) {
    468   return DoBlasInternal(cublasSaxpy, stream, true /* = pointer_mode_host */,
    469                         elem_count, &alpha, GpuMemory(x), incx,
    470                         GpuMemoryMutable(y), incy);
    471 }
    472 
    473 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
    474                           const DeviceMemory<double> &x, int incx,
    475                           DeviceMemory<double> *y, int incy) {
    476   return DoBlasInternal(cublasDaxpy, stream, true /* = pointer_mode_host */,
    477                         elem_count, &alpha, GpuMemory(x), incx,
    478                         GpuMemoryMutable(y), incy);
    479 }
    480 
    481 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
    482                           std::complex<float> alpha,
    483                           const DeviceMemory<std::complex<float>> &x, int incx,
    484                           DeviceMemory<std::complex<float>> *y, int incy) {
    485   return DoBlasInternal(cublasCaxpy, stream, true /* = pointer_mode_host */,
    486                         elem_count, GpuComplex(&alpha),
    487                         GpuComplex(GpuMemory(x)), incx,
    488                         GpuComplex(GpuMemoryMutable(y)), incy);
    489 }
    490 
    491 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
    492                           std::complex<double> alpha,
    493                           const DeviceMemory<std::complex<double>> &x, int incx,
    494                           DeviceMemory<std::complex<double>> *y, int incy) {
    495   return DoBlasInternal(cublasZaxpy, stream, true /* = pointer_mode_host */,
    496                         elem_count, GpuComplex(&alpha),
    497                         GpuComplex(GpuMemory(x)), incx,
    498                         GpuComplex(GpuMemoryMutable(y)), incy);
    499 }
    500 
    501 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
    502                           const DeviceMemory<float> &x, int incx,
    503                           DeviceMemory<float> *y, int incy) {
    504   return DoBlasInternal(cublasScopy, stream, true /* = pointer_mode_host */,
    505                         elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
    506                         incy);
    507 }
    508 
    509 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
    510                           const DeviceMemory<double> &x, int incx,
    511                           DeviceMemory<double> *y, int incy) {
    512   return DoBlasInternal(cublasDcopy, stream, true /* = pointer_mode_host */,
    513                         elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
    514                         incy);
    515 }
    516 
    517 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
    518                           const DeviceMemory<std::complex<float>> &x, int incx,
    519                           DeviceMemory<std::complex<float>> *y, int incy) {
    520   return DoBlasInternal(cublasCcopy, stream, true /* = pointer_mode_host */,
    521                         elem_count, GpuComplex(GpuMemory(x)), incx,
    522                         GpuComplex(GpuMemoryMutable(y)), incy);
    523 }
    524 
    525 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
    526                           const DeviceMemory<std::complex<double>> &x, int incx,
    527                           DeviceMemory<std::complex<double>> *y, int incy) {
    528   return DoBlasInternal(cublasZcopy, stream, true /* = pointer_mode_host */,
    529                         elem_count, GpuComplex(GpuMemory(x)), incx,
    530                         GpuComplex(GpuMemoryMutable(y)), incy);
    531 }
    532 
    533 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
    534                          const DeviceMemory<float> &x, int incx,
    535                          const DeviceMemory<float> &y, int incy,
    536                          DeviceMemory<float> *result) {
    537   return DoBlasInternal(cublasSdot, stream, false /* = pointer_mode_host */,
    538                         elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
    539                         GpuMemoryMutable(result));
    540 }
    541 
    542 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
    543                          const DeviceMemory<double> &x, int incx,
    544                          const DeviceMemory<double> &y, int incy,
    545                          DeviceMemory<double> *result) {
    546   return DoBlasInternal(cublasDdot, stream, false /* = pointer_mode_host */,
    547                         elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
    548                         GpuMemoryMutable(result));
    549 }
    550 
    551 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
    552                           const DeviceMemory<std::complex<float>> &x, int incx,
    553                           const DeviceMemory<std::complex<float>> &y, int incy,
    554                           DeviceMemory<std::complex<float>> *result) {
    555   return DoBlasInternal(cublasCdotc, stream, false /* = pointer_mode_host */,
    556                         elem_count, GpuComplex(GpuMemory(x)), incx,
    557                         GpuComplex(GpuMemory(y)), incy,
    558                         GpuComplex(GpuMemoryMutable(result)));
    559 }
    560 
    561 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
    562                           const DeviceMemory<std::complex<double>> &x, int incx,
    563                           const DeviceMemory<std::complex<double>> &y, int incy,
    564                           DeviceMemory<std::complex<double>> *result) {
    565   return DoBlasInternal(cublasZdotc, stream, false /* = pointer_mode_host */,
    566                         elem_count, GpuComplex(GpuMemory(x)), incx,
    567                         GpuComplex(GpuMemory(y)), incy,
    568                         GpuComplex(GpuMemoryMutable(result)));
    569 }
    570 
    571 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
    572                           const DeviceMemory<std::complex<float>> &x, int incx,
    573                           const DeviceMemory<std::complex<float>> &y, int incy,
    574                           DeviceMemory<std::complex<float>> *result) {
    575   return DoBlasInternal(cublasCdotu, stream, false /* = pointer_mode_host */,
    576                         elem_count, GpuComplex(GpuMemory(x)), incx,
    577                         GpuComplex(GpuMemory(y)), incy,
    578                         GpuComplex(GpuMemoryMutable(result)));
    579 }
    580 
    581 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
    582                           const DeviceMemory<std::complex<double>> &x, int incx,
    583                           const DeviceMemory<std::complex<double>> &y, int incy,
    584                           DeviceMemory<std::complex<double>> *result) {
    585   return DoBlasInternal(cublasZdotu, stream, false /* = pointer_mode_host */,
    586                         elem_count, GpuComplex(GpuMemory(x)), incx,
    587                         GpuComplex(GpuMemory(y)), incy,
    588                         GpuComplex(GpuMemoryMutable(result)));
    589 }
    590 
    591 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
    592                           const DeviceMemory<float> &x, int incx,
    593                           DeviceMemory<float> *result) {
    594   return DoBlasInternal(cublasSnrm2, stream, false /* = pointer_mode_host */,
    595                         elem_count, GpuMemory(x), incx,
    596                         GpuMemoryMutable(result));
    597 }
    598 
    599 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
    600                           const DeviceMemory<double> &x, int incx,
    601                           DeviceMemory<double> *result) {
    602   return DoBlasInternal(cublasDnrm2, stream, false /* = pointer_mode_host */,
    603                         elem_count, GpuMemory(x), incx,
    604                         GpuMemoryMutable(result));
    605 }
    606 
    607 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
    608                           const DeviceMemory<std::complex<float>> &x, int incx,
    609                           DeviceMemory<float> *result) {
    610   return DoBlasInternal(cublasScnrm2, stream, false /* = pointer_mode_host */,
    611                         elem_count, GpuComplex(GpuMemory(x)), incx,
    612                         GpuMemoryMutable(result));
    613 }
    614 
    615 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
    616                           const DeviceMemory<std::complex<double>> &x, int incx,
    617                           DeviceMemory<double> *result) {
    618   return DoBlasInternal(cublasDznrm2, stream, false /* = pointer_mode_host */,
    619                         elem_count, GpuComplex(GpuMemory(x)), incx,
    620                         GpuMemoryMutable(result));
    621 }
    622 
    623 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
    624                          DeviceMemory<float> *x, int incx,
    625                          DeviceMemory<float> *y, int incy, float c, float s) {
    626   return DoBlasInternal(cublasSrot, stream, true /* = pointer_mode_host */,
    627                         elem_count, GpuMemoryMutable(x), incx,
    628                         GpuMemoryMutable(y), incy, &c, &s);
    629 }
    630 
    631 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
    632                          DeviceMemory<double> *x, int incx,
    633                          DeviceMemory<double> *y, int incy, double c,
    634                          double s) {
    635   return DoBlasInternal(cublasDrot, stream, true /* = pointer_mode_host */,
    636                         elem_count, GpuMemoryMutable(x), incx,
    637                         GpuMemoryMutable(y), incy, &c, &s);
    638 }
    639 
    640 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
    641                          DeviceMemory<std::complex<float>> *x, int incx,
    642                          DeviceMemory<std::complex<float>> *y, int incy,
    643                          float c, float s) {
    644   return DoBlasInternal(cublasCsrot, stream, true /* = pointer_mode_host */,
    645                         elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
    646                         GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
    647 }
    648 
    649 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
    650                          DeviceMemory<std::complex<double>> *x, int incx,
    651                          DeviceMemory<std::complex<double>> *y, int incy,
    652                          double c, double s) {
    653   return DoBlasInternal(cublasZdrot, stream, true /* = pointer_mode_host */,
    654                         elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
    655                         GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
    656 }
    657 
    658 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
    659                           DeviceMemory<float> *b, DeviceMemory<float> *c,
    660                           DeviceMemory<float> *s) {
    661   return DoBlasInternal(cublasSrotg, stream, false /* = pointer_mode_host */,
    662                         GpuMemoryMutable(a), GpuMemoryMutable(b),
    663                         GpuMemoryMutable(c), GpuMemoryMutable(s));
    664 }
    665 
    666 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
    667                           DeviceMemory<double> *b, DeviceMemory<double> *c,
    668                           DeviceMemory<double> *s) {
    669   return DoBlasInternal(cublasDrotg, stream, false /* = pointer_mode_host */,
    670                         GpuComplex(GpuMemoryMutable(a)), GpuMemoryMutable(b),
    671                         GpuMemoryMutable(c), GpuMemoryMutable(s));
    672 }
    673 
// Wraps cublasCrotg: computes Givens rotation parameters for complex<float>
// (a, b); c is real, s is complex. All operands are device-resident, hence
// pointer_mode_host = false.
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
                          DeviceMemory<std::complex<float>> *b,
                          DeviceMemory<float> *c,
                          DeviceMemory<std::complex<float>> *s) {
  return DoBlasInternal(
      cublasCrotg, stream, false /* = pointer_mode_host */,
      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
}
    683 
// Wraps cublasZrotg: computes Givens rotation parameters for complex<double>
// (a, b); c is real, s is complex. All operands are device-resident, hence
// pointer_mode_host = false.
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
                          DeviceMemory<std::complex<double>> *b,
                          DeviceMemory<double> *c,
                          DeviceMemory<std::complex<double>> *s) {
  return DoBlasInternal(
      cublasZrotg, stream, false /* = pointer_mode_host */,
      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
}
    693 
// Wraps cublasSrotm: applies a modified Givens rotation to x and y. `param`
// is device-resident, hence pointer_mode_host = false.
bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *x, int incx,
                          DeviceMemory<float> *y, int incy,
                          const DeviceMemory<float> &param) {
  return DoBlasInternal(cublasSrotm, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, GpuMemory(param));
}
    702 
// Wraps cublasDrotm: applies a modified Givens rotation to x and y. `param`
// is device-resident, hence pointer_mode_host = false.
bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                          DeviceMemory<double> *x, int incx,
                          DeviceMemory<double> *y, int incy,
                          const DeviceMemory<double> &param) {
  return DoBlasInternal(cublasDrotm, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, GpuMemory(param));
}
    711 
// Wraps cublasSrotmg: computes the parameters of a modified Givens rotation
// into `param`. All operands are device-resident, hence
// pointer_mode_host = false.
bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
                           DeviceMemory<float> *d2, DeviceMemory<float> *x1,
                           const DeviceMemory<float> &y1,
                           DeviceMemory<float> *param) {
  return DoBlasInternal(cublasSrotmg, stream, false /* = pointer_mode_host */,
                        GpuMemoryMutable(d1), GpuMemoryMutable(d2),
                        GpuMemoryMutable(x1), GpuMemory(y1),
                        GpuMemoryMutable(param));
}
    721 
// Wraps cublasDrotmg: computes the parameters of a modified Givens rotation
// into `param`. All operands are device-resident, hence
// pointer_mode_host = false.
bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
                           DeviceMemory<double> *d2, DeviceMemory<double> *x1,
                           const DeviceMemory<double> &y1,
                           DeviceMemory<double> *param) {
  return DoBlasInternal(cublasDrotmg, stream, false /* = pointer_mode_host */,
                        GpuMemoryMutable(d1), GpuMemoryMutable(d2),
                        GpuMemoryMutable(x1), GpuMemory(y1),
                        GpuMemoryMutable(param));
}
    731 
// Wraps cublasSscal: x <- alpha * x. alpha is a host-side scalar, hence
// pointer_mode_host = true.
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasSscal, stream, true /* = pointer_mode_host */,
                        elem_count, &alpha, GpuMemoryMutable(x), incx);
}
    737 
// Wraps cublasDscal: x <- alpha * x. alpha is a host-side scalar, hence
// pointer_mode_host = true.
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDscal, stream, true /* = pointer_mode_host */,
                        elem_count, &alpha, GpuMemoryMutable(x), incx);
}
    743 
// Wraps cublasCsscal: scales a complex<float> vector by a real host-side
// alpha (pointer_mode_host = true).
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCsscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}
    750 
// Wraps cublasZdscal: scales a complex<double> vector by a real host-side
// alpha (pointer_mode_host = true).
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZdscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}
    757 
// Wraps cublasCscal: scales a complex<float> vector by a complex host-side
// alpha (pointer_mode_host = true).
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                          std::complex<float> alpha,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}
    765 
// Wraps cublasZscal: scales a complex<double> vector by a complex host-side
// alpha (pointer_mode_host = true).
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                          std::complex<double> alpha,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}
    773 
// Wraps cublasSswap: exchanges the contents of vectors x and y.
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *x, int incx,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy);
}
    781 
// Wraps cublasDswap: exchanges the contents of vectors x and y.
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<double> *x, int incx,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy);
}
    789 
// Wraps cublasCswap: exchanges the contents of complex<float> vectors x and y.
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<std::complex<float>> *x, int incx,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
    797 
// Wraps cublasZswap: exchanges the contents of complex<double> vectors x and y.
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<std::complex<double>> *x, int incx,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
    805 
// Wraps cublasIsamax: writes the index of the max-magnitude element of x into
// device-resident `result` (hence pointer_mode_host = false).
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIsamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}
    813 
// Wraps cublasIdamax: writes the index of the max-magnitude element of x into
// device-resident `result` (hence pointer_mode_host = false).
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIdamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}
    821 
// Wraps cublasIcamax: complex<float> variant of Iamax; result is
// device-resident (pointer_mode_host = false).
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIcamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}
    829 
// Wraps cublasIzamax: complex<double> variant of Iamax; result is
// device-resident (pointer_mode_host = false).
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x,
                           int incx, DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIzamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}
    837 
    838 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
    839                            const DeviceMemory<float> &x, int incx,
    840                            DeviceMemory<int> *result) {
    841   return DoBlasInternal(cublasIsamin, stream, false /* = pointer_mode_host */,
    842                         elem_count, GpuComplex(GpuMemory(x)), incx,
    843                         GpuMemoryMutable(result));
    844 }
    845 
    846 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
    847                            const DeviceMemory<double> &x, int incx,
    848                            DeviceMemory<int> *result) {
    849   return DoBlasInternal(cublasIdamin, stream, false /* = pointer_mode_host */,
    850                         elem_count, GpuComplex(GpuMemory(x)), incx,
    851                         GpuMemoryMutable(result));
    852 }
    853 
// Wraps cublasIcamin: complex<float> variant of Iamin; result is
// device-resident (pointer_mode_host = false).
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIcamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}
    861 
// Wraps cublasIzamin: complex<double> variant of Iamin; result is
// device-resident (pointer_mode_host = false).
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x,
                           int incx, DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIzamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}
    869 
// Wraps cublasSgbmv: banded matrix-vector product y <- alpha*op(a)*x + beta*y
// with kl sub- and ku super-diagonals. alpha/beta are host-side scalars
// (pointer_mode_host = true).
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku, float alpha,
                          const DeviceMemory<float> &a, int lda,
                          const DeviceMemory<float> &x, int incx, float beta,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
                        GpuMemory(a), lda, GpuMemory(x), incx, &beta,
                        GpuMemoryMutable(y), incy);
}
    880 
// Wraps cublasDgbmv: double-precision banded matrix-vector product
// y <- alpha*op(a)*x + beta*y (pointer_mode_host = true).
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku, double alpha,
                          const DeviceMemory<double> &a, int lda,
                          const DeviceMemory<double> &x, int incx, double beta,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
                        GpuMemory(a), lda, GpuMemory(x), incx, &beta,
                        GpuMemoryMutable(y), incy);
}
    891 
// Wraps cublasCgbmv: complex<float> banded matrix-vector product
// y <- alpha*op(a)*x + beta*y (pointer_mode_host = true).
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
    905 
// Wraps cublasZgbmv: complex<double> banded matrix-vector product
// y <- alpha*op(a)*x + beta*y (pointer_mode_host = true).
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
    919 
// Wraps cublasSgemv: y <- alpha*op(a)*x + beta*y. alpha/beta are host-side
// scalars (pointer_mode_host = true).
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, float alpha, const DeviceMemory<float> &a,
                          int lda, const DeviceMemory<float> &x, int incx,
                          float beta, DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}
    929 
// Wraps cublasDgemv: double-precision y <- alpha*op(a)*x + beta*y
// (pointer_mode_host = true).
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, double alpha, const DeviceMemory<double> &a,
                          int lda, const DeviceMemory<double> &x, int incx,
                          double beta, DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}
    939 
// Wraps cublasCgemv: complex<float> y <- alpha*op(a)*x + beta*y
// (pointer_mode_host = true).
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
    952 
// Wraps cublasZgemv: complex<double> y <- alpha*op(a)*x + beta*y
// (pointer_mode_host = true).
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
    965 
// Wraps cublasSger: rank-1 update a <- alpha*x*y^T + a
// (pointer_mode_host = true).
bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
                         const DeviceMemory<float> &x, int incx,
                         const DeviceMemory<float> &y, int incy,
                         DeviceMemory<float> *a, int lda) {
  return DoBlasInternal(cublasSger, stream, true /* = pointer_mode_host */, m,
                        n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
                        GpuMemoryMutable(a), lda);
}
    974 
// Wraps cublasDger: double-precision rank-1 update a <- alpha*x*y^T + a
// (pointer_mode_host = true).
bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
                         const DeviceMemory<double> &x, int incx,
                         const DeviceMemory<double> &y, int incy,
                         DeviceMemory<double> *a, int lda) {
  return DoBlasInternal(cublasDger, stream, true /* = pointer_mode_host */, m,
                        n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
                        GpuMemoryMutable(a), lda);
}
    983 
// Wraps cublasCgerc: complex<float> conjugated rank-1 update
// a <- alpha*x*y^H + a (pointer_mode_host = true).
bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCgerc, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
    994 
// Wraps cublasZgerc: complex<double> conjugated rank-1 update
// a <- alpha*x*y^H + a (pointer_mode_host = true).
bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZgerc, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
   1005 
// Wraps cublasCgeru: complex<float> unconjugated rank-1 update
// a <- alpha*x*y^T + a (pointer_mode_host = true).
bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCgeru, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
   1016 
// Wraps cublasZgeru: complex<double> unconjugated rank-1 update
// a <- alpha*x*y^T + a (pointer_mode_host = true).
bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZgeru, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
   1027 
// Wraps cublasChbmv: Hermitian banded matrix-vector product
// y <- alpha*a*x + beta*y with bandwidth k (pointer_mode_host = true).
bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasChbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
   1040 
// Wraps cublasZhbmv: complex<double> Hermitian banded matrix-vector product
// y <- alpha*a*x + beta*y with bandwidth k (pointer_mode_host = true).
bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZhbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
   1053 
// Wraps cublasChemv: Hermitian matrix-vector product y <- alpha*a*x + beta*y
// (pointer_mode_host = true).
bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasChemv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
   1066 
// Wraps cublasZhemv: complex<double> Hermitian matrix-vector product
// y <- alpha*a*x + beta*y (pointer_mode_host = true).
bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZhemv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
   1079 
// Wraps cublasCher: Hermitian rank-1 update a <- alpha*x*x^H + a with real
// host-side alpha (pointer_mode_host = true).
bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                         float alpha,
                         const DeviceMemory<std::complex<float>> &x, int incx,
                         DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCher, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha,
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
   1089 
// Wraps cublasZher: complex<double> Hermitian rank-1 update
// a <- alpha*x*x^H + a with real host-side alpha (pointer_mode_host = true).
bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                         double alpha,
                         const DeviceMemory<std::complex<double>> &x, int incx,
                         DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZher, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha,
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
   1099 
// Wraps cublasCher2: Hermitian rank-2 update
// a <- alpha*x*y^H + conj(alpha)*y*x^H + a (pointer_mode_host = true).
bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCher2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
   1111 
// Wraps cublasZher2: complex<double> Hermitian rank-2 update
// a <- alpha*x*y^H + conj(alpha)*y*x^H + a (pointer_mode_host = true).
bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZher2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
   1123 
// Wraps cublasChpmv: Hermitian packed matrix-vector product
// y <- alpha*ap*x + beta*y (pointer_mode_host = true). `ap` is the packed
// triangle selected by `uplo`.
bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &ap,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasChpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
   1136 
// Wraps cublasZhpmv: complex<double> Hermitian packed matrix-vector product
// y <- alpha*ap*x + beta*y (pointer_mode_host = true).
bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &ap,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZhpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
   1149 
// Wraps cublasChpr: Hermitian packed rank-1 update ap <- alpha*x*x^H + ap
// (pointer_mode_host = true).
bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         float alpha,
                         const DeviceMemory<std::complex<float>> &x, int incx,
                         DeviceMemory<std::complex<float>> *ap) {
  // NOTE(review): alpha is a real scalar here, yet it is passed through
  // GpuComplex(&alpha), unlike DoBlasHer which passes &alpha directly --
  // presumably GpuComplex is an identity for real pointers; verify.
  return DoBlasInternal(cublasChpr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(ap)));
}
   1159 
// Wraps cublasZhpr: complex<double> Hermitian packed rank-1 update
// ap <- alpha*x*x^H + ap (pointer_mode_host = true).
bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         double alpha,
                         const DeviceMemory<std::complex<double>> &x, int incx,
                         DeviceMemory<std::complex<double>> *ap) {
  // NOTE(review): real alpha wrapped in GpuComplex(), unlike DoBlasHer --
  // presumably an identity for real pointers; verify.
  return DoBlasInternal(cublasZhpr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(ap)));
}
   1169 
// Wraps cublasChpr2: Hermitian packed rank-2 update
// ap <- alpha*x*y^H + conj(alpha)*y*x^H + ap (pointer_mode_host = true).
bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *ap) {
  return DoBlasInternal(
      cublasChpr2, stream, true /* = pointer_mode_host */,
      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)),
      incx, GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(ap)));
}
   1180 
// Wraps cublasZhpr2: complex<double> Hermitian packed rank-2 update
// ap <- alpha*x*y^H + conj(alpha)*y*x^H + ap (pointer_mode_host = true).
bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *ap) {
  return DoBlasInternal(
      cublasZhpr2, stream, true /* = pointer_mode_host */,
      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)),
      incx, GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(ap)));
}
   1191 
// Symmetric banded matrix-vector multiply (float); dispatches to cublasSsbmv.
// `k` is the number of super-diagonals of the band; `y` is updated in place.
bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, float alpha, const DeviceMemory<float> &a,
                          int lda, const DeviceMemory<float> &x, int incx,
                          float beta, DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSsbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}

// Double-precision overload; dispatches to cublasDsbmv.
bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, double alpha, const DeviceMemory<double> &a,
                          int lda, const DeviceMemory<double> &x, int incx,
                          double beta, DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDsbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}
   1211 
// Symmetric packed matrix-vector multiply (float); dispatches to cublasSspmv.
// `ap` holds the packed triangle selected by `uplo`; `y` is updated in place.
bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &ap,
                          const DeviceMemory<float> &x, int incx, float beta,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSspmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}

// Double-precision overload; dispatches to cublasDspmv.
bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &ap,
                          const DeviceMemory<double> &x, int incx, double beta,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDspmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}
   1229 
// Symmetric packed rank-1 update (float); dispatches to cublasSspr.
// `ap` holds the packed triangle selected by `uplo` and is updated in place.
bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         float alpha, const DeviceMemory<float> &x, int incx,
                         DeviceMemory<float> *ap) {
  return DoBlasInternal(cublasSspr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(ap));
}

// Double-precision overload; dispatches to cublasDspr.
bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         double alpha, const DeviceMemory<double> &x, int incx,
                         DeviceMemory<double> *ap) {
  return DoBlasInternal(cublasDspr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(ap));
}
   1245 
// Symmetric packed rank-2 update (float); dispatches to cublasSspr2.
// `ap` holds the packed triangle selected by `uplo` and is updated in place.
bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &x, int incx,
                          const DeviceMemory<float> &y, int incy,
                          DeviceMemory<float> *ap) {
  return DoBlasInternal(cublasSspr2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemory(y), incy, GpuMemoryMutable(ap));
}

// Double-precision overload; dispatches to cublasDspr2.
bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &x, int incx,
                          const DeviceMemory<double> &y, int incy,
                          DeviceMemory<double> *ap) {
  return DoBlasInternal(cublasDspr2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemory(y), incy, GpuMemoryMutable(ap));
}
   1263 
// Symmetric matrix-vector multiply (float); dispatches to cublasSsymv.
// Only the triangle selected by `uplo` of `a` is referenced; `y` is updated
// in place.
bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &a, int lda,
                          const DeviceMemory<float> &x, int incx, float beta,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSsymv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}

// Double-precision overload; dispatches to cublasDsymv.
bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &a, int lda,
                          const DeviceMemory<double> &x, int incx, double beta,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDsymv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}
   1281 
// Symmetric rank-1 update (float); dispatches to cublasSsyr.
// The triangle of `a` selected by `uplo` is updated in place.
bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         float alpha, const DeviceMemory<float> &x, int incx,
                         DeviceMemory<float> *a, int lda) {
  return DoBlasInternal(cublasSsyr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(a), lda);
}

// Double-precision overload; dispatches to cublasDsyr.
bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         double alpha, const DeviceMemory<double> &x, int incx,
                         DeviceMemory<double> *a, int lda) {
  return DoBlasInternal(cublasDsyr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(a), lda);
}
   1297 
// Symmetric rank-2 update (float); dispatches to cublasSsyr2.
// The triangle of `a` selected by `uplo` is updated in place.
bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &x, int incx,
                          const DeviceMemory<float> &y, int incy,
                          DeviceMemory<float> *a, int lda) {
  return DoBlasInternal(cublasSsyr2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemory(y), incy, GpuMemoryMutable(a), lda);
}

// Double-precision overload; dispatches to cublasDsyr2.
bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &x, int incx,
                          const DeviceMemory<double> &y, int incy,
                          DeviceMemory<double> *a, int lda) {
  return DoBlasInternal(cublasDsyr2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemory(y), incy, GpuMemoryMutable(a), lda);
}
   1315 
// Triangular banded matrix-vector multiply; four overloads dispatching to
// cublas{S,D,C,Z}tbmv. `k` is the number of sub/super-diagonals; the result
// overwrites `x` in place.
bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasStbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<std::complex<float>> &a,
                          int lda, DeviceMemory<std::complex<float>> *x,
                          int incx) {
  return DoBlasInternal(cublasCtbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<std::complex<double>> &a,
                          int lda, DeviceMemory<std::complex<double>> *x,
                          int incx) {
  return DoBlasInternal(cublasZtbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
   1357 
// Triangular banded solve; four overloads dispatching to cublas{S,D,C,Z}tbsv.
// `x` carries the right-hand side on entry and the solution on exit.
bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasStbsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtbsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<std::complex<float>> &a,
                          int lda, DeviceMemory<std::complex<float>> *x,
                          int incx) {
  return DoBlasInternal(cublasCtbsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<std::complex<double>> &a,
                          int lda, DeviceMemory<std::complex<double>> *x,
                          int incx) {
  return DoBlasInternal(cublasZtbsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
   1399 
// Triangular packed matrix-vector multiply; four overloads dispatching to
// cublas{S,D,C,Z}tpmv. `ap` is the packed triangle; the result overwrites
// `x` in place.
bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
                          int incx) {
  return DoBlasInternal(cublasStpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<double> &ap,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<float>> &ap,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCtpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<double>> &ap,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZtpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}
   1439 
// Triangular packed solve; four overloads dispatching to cublas{S,D,C,Z}tpsv.
// `x` carries the right-hand side on entry and the solution on exit.
bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
                          int incx) {
  return DoBlasInternal(cublasStpsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<double> &ap,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtpsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<float>> &ap,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCtpsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<double>> &ap,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZtpsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}
   1479 
// Triangular matrix-vector multiply; four overloads dispatching to
// cublas{S,D,C,Z}trmv. The result overwrites `x` in place.
bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasStrmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtrmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCtrmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZtrmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
   1519 
// Triangular solve; four overloads dispatching to cublas{S,D,C,Z}trsv.
// `x` carries the right-hand side on entry and the solution on exit.
bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasStrsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtrsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCtrsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZtrsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
   1559 
   1560 bool CUDABlas::DoBlasGemm(
   1561     Stream *stream, blas::Transpose transa,
   1562     blas::Transpose transb, uint64 m, uint64 n, uint64 k,
   1563     float alpha, const DeviceMemory<Eigen::half> &a, int lda,
   1564     const DeviceMemory<Eigen::half> &b, int ldb, float beta,
   1565     DeviceMemory<Eigen::half> *c, int ldc) {
   1566 #if CUDA_VERSION >= 7050
   1567   VLOG(1) << port::Printf(
   1568       "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
   1569       "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
   1570       "c=%p ldc=%d",
   1571       static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
   1572       a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
   1573   if (transa == blas::Transpose::kNoTranspose) {
   1574     if (lda < static_cast<int64>(m)) {
   1575       LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
   1576                       "precondition violation";
   1577     }
   1578   } else {
   1579     if (lda < static_cast<int64>(k)) {
   1580       LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
   1581                    << ") (transpose case); precondition violation";
   1582     }
   1583   }
   1584   if (transb == blas::Transpose::kNoTranspose) {
   1585     if (ldb < static_cast<int64>(k)) {
   1586       LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
   1587                    << ") (no transpose case); precondition violation";
   1588     }
   1589   } else {
   1590     if (ldb < static_cast<int64>(n)) {
   1591       LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
   1592                       "precondition violation";
   1593     }
   1594   }
   1595 
   1596   bool use_tensor_ops = false;
   1597 #if CUDA_VERSION >= 9000
   1598   int cc_major, cc_minor;
   1599   stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
   1600                                                                    &cc_minor);
   1601 
   1602   // GPUs < sm_70 don't support tensor ops.
   1603   if (cc_major >= 7 && TensorOpMathEnabled()) {
   1604     use_tensor_ops = true;
   1605   }
   1606 #endif
   1607 
   1608   return DoBlasInternalImpl(
   1609       cublasSgemmEx, stream, true /* = pointer_mode_host */,
   1610       true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
   1611       CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
   1612       SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
   1613       GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
   1614 
   1615 #else
   1616   LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
   1617              << "(need at least CUDA 7.5)";
   1618   return false;
   1619 #endif
   1620 }
   1621 
   1622 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
   1623                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
   1624                           float alpha, const DeviceMemory<float> &a, int lda,
   1625                           const DeviceMemory<float> &b, int ldb, float beta,
   1626                           DeviceMemory<float> *c, int ldc) {
   1627   VLOG(1) << port::Printf(
   1628       "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
   1629       "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
   1630       "c=%p ldc=%d",
   1631       static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
   1632       a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
   1633   if (transa == blas::Transpose::kNoTranspose) {
   1634     if (lda < static_cast<int64>(m)) {
   1635       LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
   1636                       "precondition violation";
   1637     }
   1638   } else {
   1639     if (lda < static_cast<int64>(k)) {
   1640       LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
   1641                    << ") (transpose case); precondition violation";
   1642     }
   1643   }
   1644   if (transb == blas::Transpose::kNoTranspose) {
   1645     if (ldb < static_cast<int64>(k)) {
   1646       LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
   1647                    << ") (no transpose case); precondition violation";
   1648     }
   1649   } else {
   1650     if (ldb < static_cast<int64>(n)) {
   1651       LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
   1652                       "precondition violation";
   1653     }
   1654   }
   1655   return DoBlasInternal(cublasSgemm, stream, true /* = pointer_mode_host */,
   1656                         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
   1657                         n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
   1658                         &beta, GpuMemoryMutable(c), ldc);
   1659 }
   1660 
// Double-precision GEMM; dispatches to cublasDgemm. Unlike the float/half
// overloads, no leading-dimension sanity logging is performed here.
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
                          double alpha, const DeviceMemory<double> &a, int lda,
                          const DeviceMemory<double> &b, int ldb, double beta,
                          DeviceMemory<double> *c, int ldc) {
  return DoBlasInternal(cublasDgemm, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
                        n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
                        &beta, GpuMemoryMutable(c), ldc);
}
   1671 
// Complex GEMM overloads; dispatch to cublasCgemm / cublasZgemm. Scalars are
// passed by host pointer (pointer_mode_host) after GpuComplex conversion.
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &b, int ldb,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasCgemm, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
                        n, k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &b, int ldb,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZgemm, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
                        n, k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
   1699 
// Profiled GEMV entry points; all four overloads forward unchanged to the
// templated DoBlasGemvWithProfilingImpl, which fills `output_profile_result`.
bool CUDABlas::DoBlasGemvWithProfiling(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
    int incx, float beta, DeviceMemory<float> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
                                     incx, beta, y, incy,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemvWithProfiling(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
    int incx, double beta, DeviceMemory<double> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
                                     incx, beta, y, incy,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemvWithProfiling(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
    std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
    int lda, const DeviceMemory<std::complex<float>> &x, int incx,
    std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
                                     incx, beta, y, incy,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemvWithProfiling(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
    std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
    int lda, const DeviceMemory<std::complex<double>> &x, int incx,
    std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
                                     incx, beta, y, incy,
                                     output_profile_result);
}
   1741 
   1742 bool CUDABlas::DoBlasGemmWithProfiling(
   1743     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1744     uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
   1745     int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
   1746     DeviceMemory<Eigen::half> *c, int ldc,
   1747     blas::ProfileResult *output_profile_result) {
   1748   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
   1749                                      lda, b, ldb, beta, c, ldc,
   1750                                      output_profile_result);
   1751 }
   1752 
   1753 bool CUDABlas::DoBlasGemmWithProfiling(
   1754     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1755     uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
   1756     const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
   1757     int ldc, blas::ProfileResult *output_profile_result) {
   1758   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
   1759                                      lda, b, ldb, beta, c, ldc,
   1760                                      output_profile_result);
   1761 }
   1762 
   1763 bool CUDABlas::DoBlasGemmWithProfiling(
   1764     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1765     uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
   1766     const DeviceMemory<double> &b, int ldb, double beta,
   1767     DeviceMemory<double> *c, int ldc,
   1768     blas::ProfileResult *output_profile_result) {
   1769   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
   1770                                      lda, b, ldb, beta, c, ldc,
   1771                                      output_profile_result);
   1772 }
   1773 
   1774 bool CUDABlas::DoBlasGemmWithProfiling(
   1775     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1776     uint64 n, uint64 k, std::complex<float> alpha,
   1777     const DeviceMemory<std::complex<float>> &a, int lda,
   1778     const DeviceMemory<std::complex<float>> &b, int ldb,
   1779     std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
   1780     blas::ProfileResult *output_profile_result) {
   1781   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
   1782                                      lda, b, ldb, beta, c, ldc,
   1783                                      output_profile_result);
   1784 }
   1785 
   1786 bool CUDABlas::DoBlasGemmWithProfiling(
   1787     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1788     uint64 n, uint64 k, std::complex<double> alpha,
   1789     const DeviceMemory<std::complex<double>> &a, int lda,
   1790     const DeviceMemory<std::complex<double>> &b, int ldb,
   1791     std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
   1792     blas::ProfileResult *output_profile_result) {
   1793   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
   1794                                      lda, b, ldb, beta, c, ldc,
   1795                                      output_profile_result);
   1796 }
   1797 
template <typename T>
bool CUDABlas::DoBlasGemvWithProfilingImpl(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, const T &alpha,
    const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
    const T &beta, DeviceMemory<T> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  // Runs DoBlasGemv, optionally wrapping it in a GpuTimer when the caller
  // requested profiling output.  Returns false if timer setup fails or if the
  // underlying GEMV call fails.
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (output_profile_result != nullptr) {
    timer.reset(new GpuTimer(parent_));
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return false;
    }
  }

  // Call blasGemv (there is only one gemv "algorithm" to profile).
  bool result =
      DoBlasGemv(stream, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);

  if (timer != nullptr && result) {
    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
    // state.
    if (!timer->Stop(AsGpuStream(stream))) {
      return false;
    }
    output_profile_result->set_is_valid(true);
    output_profile_result->set_algorithm(blas::kDefaultBlasGemv);
    output_profile_result->set_elapsed_time_in_ms(
        timer->GetElapsedMilliseconds());
  }
  return result;
}
   1829 
template <typename T, typename ParamType>
bool CUDABlas::DoBlasGemmWithProfilingImpl(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
    int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
    DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
  // Runs DoBlasGemm, optionally wrapping it in a GpuTimer when the caller
  // requested profiling output.  Returns false if timer setup fails or if the
  // underlying GEMM call fails.
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (output_profile_result != nullptr) {
    timer.reset(new GpuTimer(parent_));
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return false;
    }
  }

  // Call blasGemm
  bool result = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a, lda, b,
                           ldb, beta, c, ldc);

  if (timer != nullptr && result) {
    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
    // state.
    if (!timer->Stop(AsGpuStream(stream))) {
      return false;
    }
    output_profile_result->set_is_valid(true);
    output_profile_result->set_algorithm(blas::kDefaultBlasGemm);
    output_profile_result->set_elapsed_time_in_ms(
        timer->GetElapsedMilliseconds());
  }
  return result;
}
   1861 
   1862 static bool UsesTensorOps(blas::AlgorithmType algo) {
   1863 #if CUDA_VERSION >= 9000
   1864   cublasGemmAlgo_t cublas_algo = static_cast<cublasGemmAlgo_t>(algo);
   1865   return cublas_algo >= CUBLAS_GEMM_DEFAULT_TENSOR_OP;
   1866 #else
   1867   return false;
   1868 #endif
   1869 }
   1870 
   1871 template <typename InType>
   1872 static bool TensorOpsAvailable(int cc_major) {
   1873 #if CUDA_VERSION >= 9000
   1874   // cublas *does* allow tensor ops on inputs that are not fp16, so this is not
   1875   // strictly correct.  We can't simply enable it, though, as that would change
   1876   // clients' behavior significantly: Using tensor ops on fp32 inputs cause them
   1877   // to be rounded to fp16.
   1878   if (cc_major >= 7 && TensorOpMathEnabled() &&
   1879       std::is_same<InType, Eigen::half>::value) {
   1880     return true;
   1881   }
   1882 #endif
   1883   return false;
   1884 }
   1885 
   1886 template <typename InT, typename OutT, typename CompT>
   1887 bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
   1888     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   1889     uint64 n, uint64 k, const HostOrDeviceScalar<CompT> &alpha,
   1890     const DeviceMemory<InT> &a, int lda, const DeviceMemory<InT> &b, int ldb,
   1891     const HostOrDeviceScalar<CompT> &beta, DeviceMemory<OutT> *c, int ldc,
   1892     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
   1893     blas::ProfileResult *output_profile_result) {
   1894   // GPUs < sm_50 don't support cublasGemmEx.
   1895   int cc_major, cc_minor;
   1896   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
   1897           &cc_major, &cc_minor) &&
   1898       cc_major < 5) {
   1899     VLOG(2) << "DoBlasGemmWithAlgorithm returning false because sm" << cc_major
   1900             << cc_minor << " devices don't support explicit gemm algorithms.";
   1901     return false;
   1902   }
   1903 
   1904   if (UsesTensorOps(algorithm) && !TensorOpsAvailable<InT>(cc_major)) {
   1905     if (std::is_same<InT, Eigen::half>::value) {
   1906       VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
   1907               << algorithm
   1908               << " uses tensor ops, but tensor ops are not available in sm"
   1909               << cc_major << "X devices.";
   1910     } else {
   1911       VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
   1912               << algorithm
   1913               << " uses tensor ops, but the input data type is not fp16.";
   1914     }
   1915     return false;
   1916   }
   1917 
   1918   // Either both 'alpha' and 'beta' need to be pointers to device memory, or
   1919   // they need to be both host scalars.
   1920   if (alpha.is_pointer() != beta.is_pointer()) {
   1921     VLOG(2) << "DoBlasGemmWithAlgorithm returning false because one of `alpha` "
   1922                "and `beta` is a pointer, but the other is not.";
   1923     return false;
   1924   }
   1925 
   1926   std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   1927   if (output_profile_result != nullptr) {
   1928     timer.reset(new GpuTimer(parent_));
   1929     if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
   1930       VLOG(2) << "DoBlasGemmWithAlgorithm returning false because "
   1931                  "output_profile_result was given, but we were unable to "
   1932                  "create a GpuTimer.";
   1933       return false;
   1934     }
   1935   }
   1936 
   1937   // Return false if we might be hitting a cuBLAS bug that produces the wrong
   1938   // result. See nvbugs/2156201, b/79126339.
   1939 #if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
   1940   if ((algorithm == CUBLAS_GEMM_DEFAULT || algorithm >= CUBLAS_GEMM_ALGO13) &&
   1941       std::max({m, n, k}) >= 2097153 && cc_major < 7) {
   1942     VLOG(2) << "DoBlasGemmWithAlgorithm returning false to work around cudnn "
   1943                "<9.2 bug with m, n, or k >= 2097153.  See b/79126339.";
   1944     return false;
   1945   }
   1946 #endif
   1947 
   1948   cudaDataType_t cuda_in_type = CUDADataType<InT>::type;
   1949   // Since we are converting 'algorithm' to cublasGemmAlgo_t by static_cast,
   1950   // we do the following compile-time check on the default value:
   1951   static_assert(blas::kDefaultGemmAlgo == CUBLAS_GEMM_DFALT, "");
   1952   // If 'alpha' and 'beta' are host scalars and CompT is Eigen::half, we
   1953   // essentially reinterpet_cast to __half, which is safe because Eigen::half
   1954   // inherits from __half.
   1955   bool result = DoBlasInternalFailureOK(
   1956       cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(),
   1957       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
   1958       alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(),
   1959       GpuMemory(a), cuda_in_type, lda, GpuMemory(b), cuda_in_type, ldb,
   1960       beta.is_pointer() ? GpuMemory(beta.pointer()) : &beta.value(),
   1961       GpuMemoryMutable(c), CUDADataType<OutT>::type, ldc,
   1962       CUDAComputationType(computation_type),
   1963       static_cast<cublasGemmAlgo_t>(algorithm));
   1964 
   1965   if (timer != nullptr && result) {
   1966     // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
   1967     // state.
   1968     if (!timer->Stop(AsGpuStream(stream))) {
   1969       VLOG(2) << "DoBlasGemmWithAlgorithm returning false; unable to stop "
   1970                  "GpuTimer.";
   1971       return false;
   1972     }
   1973     output_profile_result->set_is_valid(true);
   1974     output_profile_result->set_algorithm(algorithm);
   1975     output_profile_result->set_elapsed_time_in_ms(
   1976         timer->GetElapsedMilliseconds());
   1977   }
   1978   return result;
   1979 }
   1980 
bool CUDABlas::GetBlasGemmAlgorithms(
    std::vector<blas::AlgorithmType> *out_algorithms) {
  // Enumerates every cublasGemmAlgo_t value known to the CUDA toolkit this
  // file was compiled against.  cublasGemmAlgo_t (and the function that
  // accepts this type, cublasGemmEx) were first introduced in CUDA 8.
  //
  // Note that when CUDA version and compute capability is not sufficient, we
  // still return the out_algorithms. Caller needs to make sure that in this
  // case, the returned vector is empty.
  *out_algorithms = {
    CUBLAS_GEMM_DFALT,
    CUBLAS_GEMM_ALGO0,
    CUBLAS_GEMM_ALGO1,
    CUBLAS_GEMM_ALGO2,
    CUBLAS_GEMM_ALGO3,
    CUBLAS_GEMM_ALGO4,
    CUBLAS_GEMM_ALGO5,
    CUBLAS_GEMM_ALGO6,
    CUBLAS_GEMM_ALGO7,
// Algorithms 8-17 and the tensor-op variants were added in CUDA 9.
#if CUDA_VERSION >= 9000
    CUBLAS_GEMM_ALGO8,
    CUBLAS_GEMM_ALGO9,
    CUBLAS_GEMM_ALGO10,
    CUBLAS_GEMM_ALGO11,
    CUBLAS_GEMM_ALGO12,
    CUBLAS_GEMM_ALGO13,
    CUBLAS_GEMM_ALGO14,
    CUBLAS_GEMM_ALGO15,
    CUBLAS_GEMM_ALGO16,
    CUBLAS_GEMM_ALGO17,
    CUBLAS_GEMM_DFALT_TENSOR_OP,
    CUBLAS_GEMM_ALGO0_TENSOR_OP,
    CUBLAS_GEMM_ALGO1_TENSOR_OP,
    CUBLAS_GEMM_ALGO2_TENSOR_OP,
    CUBLAS_GEMM_ALGO3_TENSOR_OP,
    CUBLAS_GEMM_ALGO4_TENSOR_OP,
#endif
// Algorithms 18-23 and tensor-op variants 5-15 were added in CUDA 9.2.
#if CUDA_VERSION >= 9020
    CUBLAS_GEMM_ALGO18,
    CUBLAS_GEMM_ALGO19,
    CUBLAS_GEMM_ALGO20,
    CUBLAS_GEMM_ALGO21,
    CUBLAS_GEMM_ALGO22,
    CUBLAS_GEMM_ALGO23,
    CUBLAS_GEMM_ALGO5_TENSOR_OP,
    CUBLAS_GEMM_ALGO6_TENSOR_OP,
    CUBLAS_GEMM_ALGO7_TENSOR_OP,
    CUBLAS_GEMM_ALGO8_TENSOR_OP,
    CUBLAS_GEMM_ALGO9_TENSOR_OP,
    CUBLAS_GEMM_ALGO10_TENSOR_OP,
    CUBLAS_GEMM_ALGO11_TENSOR_OP,
    CUBLAS_GEMM_ALGO12_TENSOR_OP,
    CUBLAS_GEMM_ALGO13_TENSOR_OP,
    CUBLAS_GEMM_ALGO14_TENSOR_OP,
    CUBLAS_GEMM_ALGO15_TENSOR_OP,
#endif
  };
  return true;
}
   2039 
   2040 bool CUDABlas::DoBlasGemmWithAlgorithm(
   2041     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2042     uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
   2043     const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b, int ldb,
   2044     const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c, int ldc,
   2045     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
   2046     blas::ProfileResult *output_profile_result) {
   2047   return DoBlasGemmWithAlgorithmImpl(
   2048       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
   2049       computation_type, algorithm, output_profile_result);
   2050 }
   2051 
   2052 bool CUDABlas::DoBlasGemmWithAlgorithm(
   2053     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2054     uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
   2055     const DeviceMemory<Eigen::half> &a, int lda,
   2056     const DeviceMemory<Eigen::half> &b, int ldb,
   2057     const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
   2058     int ldc, blas::ComputationType computation_type,
   2059     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   2060   if (computation_type == blas::ComputationType::kF32) {
   2061     if (alpha.is_pointer() || beta.is_pointer()) {
   2062       // We cannot easily convert a pointer to f16 memory to a pointer to f32
   2063       // memory from here, so we don't support this for now.
   2064       // TODO(akuegel): Investigate whether we can do the conversion before
   2065       // calling DoBlasGemmWithAlgorithm.
   2066       return false;
   2067     }
   2068     HostOrDeviceScalar<float> float_alpha(static_cast<float>(alpha.value()));
   2069     HostOrDeviceScalar<float> float_beta(static_cast<float>(beta.value()));
   2070     return DoBlasGemmWithAlgorithmImpl(
   2071         stream, transa, transb, m, n, k, float_alpha, a, lda, b, ldb,
   2072         float_beta, c, ldc, computation_type, algorithm, output_profile_result);
   2073   }
   2074 
   2075   CHECK_EQ(computation_type, blas::ComputationType::kF16);
   2076   return DoBlasGemmWithAlgorithmImpl(
   2077       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
   2078       computation_type, algorithm, output_profile_result);
   2079 }
   2080 
   2081 bool CUDABlas::DoBlasGemmWithAlgorithm(
   2082     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2083     uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
   2084     const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
   2085     int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
   2086     int ldc, blas::ComputationType computation_type,
   2087     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   2088   return DoBlasGemmWithAlgorithmImpl(
   2089       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
   2090       computation_type, algorithm, output_profile_result);
   2091 }
   2092 
   2093 bool CUDABlas::DoBlasGemmWithAlgorithm(
   2094     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2095     uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
   2096     const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
   2097     int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
   2098     int ldc, blas::ComputationType computation_type,
   2099     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   2100   return DoBlasGemmWithAlgorithmImpl(
   2101       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
   2102       computation_type, algorithm, output_profile_result);
   2103 }
   2104 
   2105 bool CUDABlas::DoBlasGemmWithAlgorithm(
   2106     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2107     uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
   2108     const DeviceMemory<std::complex<float>> &a, int lda,
   2109     const DeviceMemory<std::complex<float>> &b, int ldb,
   2110     const HostOrDeviceScalar<std::complex<float>> &beta,
   2111     DeviceMemory<std::complex<float>> *c, int ldc,
   2112     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
   2113     blas::ProfileResult *output_profile_result) {
   2114   return DoBlasGemmWithAlgorithmImpl(
   2115       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
   2116       computation_type, algorithm, output_profile_result);
   2117 }
   2118 
   2119 bool CUDABlas::DoBlasGemmWithAlgorithm(
   2120     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2121     uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
   2122     const DeviceMemory<std::complex<double>> &a, int lda,
   2123     const DeviceMemory<std::complex<double>> &b, int ldb,
   2124     const HostOrDeviceScalar<std::complex<double>> &beta,
   2125     DeviceMemory<std::complex<double>> *c, int ldc,
   2126     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
   2127     blas::ProfileResult *output_profile_result) {
   2128   return DoBlasGemmWithAlgorithmImpl(
   2129       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
   2130       computation_type, algorithm, output_profile_result);
   2131 }
   2132 
   2133 template <typename T>
   2134 struct HalfAsFloat {
   2135   typedef T type;
   2136 };
   2137 
   2138 template <>
   2139 struct HalfAsFloat<Eigen::half> {
   2140   typedef float type;
   2141 };
   2142 
template <typename T, typename Scalar, typename FuncT>
port::Status CUDABlas::DoBlasGemmBatchedInternal(
    FuncT cublas_func, Stream *stream, blas::Transpose transa,
    blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
    const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
    const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
    Scalar beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  // Shared implementation behind the DoBlasGemmBatched overloads.  Copies the
  // per-batch matrix pointers to device memory, then dispatches to
  // cublasGemmBatchedEx (CUDA >= 9.1 on sm_50+), to the type-specific
  // 'cublas_func', or -- for fp16 on older configurations -- to a loop of
  // plain DoBlasGemm calls.
  std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
  for (int i = 0; i < batch_count; ++i) {
    a_raw_ptrs.push_back(static_cast<T *>(a_ptrs_to_wrappers[i]->opaque()));
    b_raw_ptrs.push_back(static_cast<T *>(b_ptrs_to_wrappers[i]->opaque()));
    c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
  }

  // Element type for the device-side pointer arrays; Eigen::half maps to
  // float here (see HalfAsFloat above).
  typedef typename HalfAsFloat<typename GpuComplexT<T>::type>::type CUDA_T;

  // Bytes needed for one array of batch_count device pointers.
  const size_t size = batch_count * sizeof(CUDA_T *);

  // Device-side copy of pointers to matrices.
  DeviceMemory<CUDA_T *> a;
  DeviceMemory<CUDA_T *> b;
  DeviceMemory<CUDA_T *> c;

  // If temporary space is allocated for device-side copies of pointers to
  // matrices, that temporary space should not be freed until this function
  // returns. Although the values for these unique_ptrs are not set here, they
  // are declared at this scope so they will be destroyed when the function
  // returns.
  //
  // If a scratch allocator is provided, these pointers will not be used at all.
  std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_temporary;
  std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_temporary;
  std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_temporary;

  // Decide how to allocate device-side copy of pointers to matrices based on
  // whether a scratch allocator was passed.
  if (scratch_allocator != nullptr) {
    SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> a_bytes,
                        scratch_allocator->AllocateBytes(stream, size));
    SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> b_bytes,
                        scratch_allocator->AllocateBytes(stream, size));
    SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> c_bytes,
                        scratch_allocator->AllocateBytes(stream, size));
    a = DeviceMemory<CUDA_T *>(a_bytes);
    b = DeviceMemory<CUDA_T *>(b_bytes);
    c = DeviceMemory<CUDA_T *>(c_bytes);
  } else {
    SE_ASSIGN_OR_RETURN(a_temporary,
                        stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
    SE_ASSIGN_OR_RETURN(b_temporary,
                        stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
    SE_ASSIGN_OR_RETURN(c_temporary,
                        stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
    a = DeviceMemory<CUDA_T *>(*a_temporary->mutable_device_memory());
    b = DeviceMemory<CUDA_T *>(*b_temporary->mutable_device_memory());
    c = DeviceMemory<CUDA_T *>(*c_temporary->mutable_device_memory());
  }

  // Upload the host-side pointer arrays to the device copies just allocated.
  if (!stream->ThenMemcpy(&a, a_raw_ptrs.data(), size).ok() ||
      !stream->ThenMemcpy(&b, b_raw_ptrs.data(), size).ok() ||
      !stream->ThenMemcpy(&c, c_raw_ptrs.data(), size).ok()) {
    return port::Status(port::error::INTERNAL,
                        "failed to copy memory from host to device in "
                        "CUDABlas::DoBlasGemmBatched");
  }

  cudaDataType_t data_type = CUDADataType<T>::type;

  // Preferred path: cublasGemmBatchedEx handles all element types (including
  // fp16) in a single call, but needs CUDA >= 9.1 and sm_50+.
#if CUDA_VERSION >= 9010
  int cc_major, cc_minor;
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor) &&
      cc_major >= 5) {
    // fp16 inputs accumulate in fp32 and may use tensor ops when enabled.
    bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
    cublasGemmAlgo_t algo =
        (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
    cudaDataType_t compute_type =
        (data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
    const void **a_void_ptrs = reinterpret_cast<const void **>(
        const_cast<const CUDA_T **>(GpuMemory(a)));
    const void **b_void_ptrs = reinterpret_cast<const void **>(
        const_cast<const CUDA_T **>(GpuMemory(b)));
    void **c_void_ptrs =
        reinterpret_cast<void **>(const_cast<CUDA_T **>(GpuMemory(c)));
    bool ok;
    ok = DoBlasInternalImpl(
        cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
        true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
        CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
        b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
        batch_count, compute_type, algo);
    if (ok) {
      return port::Status::OK();
    }
    return port::Status(port::error::INTERNAL,
                        "failed BLAS call, see log for details");
  }
#endif
  // either CUDA_VERSION < 9.1 or SM < 5.0
  if (data_type != CUDA_R_16F) {
    bool ok = DoBlasInternal(
        cublas_func, stream, true /* = pointer_mode_host */,
        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
        GpuComplex(&alpha), const_cast<const CUDA_T **>(GpuMemory(a)), lda,
        const_cast<const CUDA_T **>(GpuMemory(b)), ldb, GpuComplex(&beta),
        const_cast<CUDA_T **>(GpuMemory(c)), ldc, batch_count);
    if (ok) {
      return port::Status::OK();
    }
    return port::Status(port::error::INTERNAL,
                        "failed BLAS call, see log for details");
  } else {
    // Fall back to a loop for fp16
    for (int b = 0; b < batch_count; ++b) {
      const DeviceMemory<T> &a_matrix = *a_ptrs_to_wrappers[b];
      const DeviceMemory<T> &b_matrix = *b_ptrs_to_wrappers[b];
      DeviceMemory<T> *c_matrix = c_ptrs_to_wrappers[b];
      bool ok = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a_matrix,
                           lda, b_matrix, ldb, beta, c_matrix, ldc);
      if (!ok) {
        return port::Status(port::error::INTERNAL,
                            "failed BLAS call, see log for details");
      }
    }
    return port::Status::OK();
  }
}
   2271 
   2272 bool CUDABlas::DoBlasGemmBatched(
   2273     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2274     uint64 n, uint64 k, float alpha,
   2275     const port::ArraySlice<DeviceMemory<Eigen::half> *> &a_array, int lda,
   2276     const port::ArraySlice<DeviceMemory<Eigen::half> *> &b_array, int ldb,
   2277     float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c_array,
   2278     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   2279   // Note: The func passed here (cublasSgemmBatched) is not actually called,
   2280   // due to special handling of fp16 inside DoBlasGemmBatchedInternal.
   2281   port::Status status = DoBlasGemmBatchedInternal(
   2282       cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
   2283       b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   2284   if (!status.ok()) {
   2285     LOG(ERROR) << status;
   2286   }
   2287   return status.ok();
   2288 }
   2289 
   2290 bool CUDABlas::DoBlasGemmBatched(
   2291     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2292     uint64 n, uint64 k, float alpha,
   2293     const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
   2294     const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta,
   2295     const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
   2296     int batch_count, ScratchAllocator *scratch_allocator) {
   2297   port::Status status = DoBlasGemmBatchedInternal(
   2298       cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
   2299       b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   2300   if (!status.ok()) {
   2301     LOG(ERROR) << status;
   2302   }
   2303   return status.ok();
   2304 }
   2305 
   2306 bool CUDABlas::DoBlasGemmBatched(
   2307     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2308     uint64 n, uint64 k, double alpha,
   2309     const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
   2310     const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
   2311     double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
   2312     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   2313   port::Status status = DoBlasGemmBatchedInternal(
   2314       cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
   2315       b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   2316   if (!status.ok()) {
   2317     LOG(ERROR) << status;
   2318   }
   2319   return status.ok();
   2320 }
   2321 
   2322 bool CUDABlas::DoBlasGemmBatched(
   2323     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2324     uint64 n, uint64 k, std::complex<float> alpha,
   2325     const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array,
   2326     int lda,
   2327     const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
   2328     int ldb, std::complex<float> beta,
   2329     const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
   2330     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   2331   port::Status status = DoBlasGemmBatchedInternal(
   2332       cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
   2333       b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   2334   if (!status.ok()) {
   2335     LOG(ERROR) << status;
   2336   }
   2337   return status.ok();
   2338 }
   2339 
   2340 bool CUDABlas::DoBlasGemmBatched(
   2341     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2342     uint64 n, uint64 k, std::complex<double> alpha,
   2343     const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array,
   2344     int lda,
   2345     const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
   2346     int ldb, std::complex<double> beta,
   2347     const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
   2348     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   2349   port::Status status = DoBlasGemmBatchedInternal(
   2350       cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
   2351       b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   2352   if (!status.ok()) {
   2353     LOG(ERROR) << status;
   2354   }
   2355   return status.ok();
   2356 }
   2357 
// Strided-batched GEMM on fp16 data with fp32 host-side alpha/beta.
// Batch i uses A + i*stride_a, B + i*stride_b, C + i*stride_c (strides in
// elements). Three dispatch tiers, from best to worst:
//   1. CUDA >= 9.1 and SM >= 5.0: one cublasGemmStridedBatchedEx call, with
//      fp32 compute (CUDA_R_32F) and tensor-op algo on SM >= 7.0 when enabled.
//   2. Otherwise: per-batch loop of cublasSgemmEx calls on half data.
// Returns false (after logging) on any cuBLAS failure.
bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
    int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
    int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
    int64 stride_c, int batch_count) {
  bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
  int cc_major, cc_minor;
  // Query the device's compute capability; on failure we conservatively fall
  // through to the per-batch loop below.
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor)) {
    // GPUs < sm_70 don't support tensor ops.
    if (cc_major >= 7 && TensorOpMathEnabled()) {
      use_tensor_ops = true;
    }
#if CUDA_VERSION >= 9010
    // cublasGemmStridedBatchedEx requires CUDA 9.1+ and SM 5.0+.
    if (cc_major >= 5) {
      cublasGemmAlgo_t algo =
          (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
      bool ok = DoBlasInternalImpl(
          cublasGemmStridedBatchedEx, stream, true /* = pointer_mode_host */,
          true /* = err_on_failure */, use_tensor_ops,
          CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
          // Inputs/outputs are fp16 (CUDA_R_16F); accumulation is fp32
          // (CUDA_R_32F, the computeType argument at the end).
          GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F,
          ldb, stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c,
          batch_count, CUDA_R_32F, algo);
      if (ok) {
        return true;
      }
      LOG(ERROR) << "failed BLAS call, see log for details";
      return false;
    }
#endif
  }
#endif
  // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
  for (int batch = 0; batch < batch_count; ++batch) {
    // Manually offset into each batch's matrix; strides are in elements.
    const auto *a_matrix =
        reinterpret_cast<const __half *>(GpuMemory(a) + batch * stride_a);
    const auto *b_matrix =
        reinterpret_cast<const __half *>(GpuMemory(b) + batch * stride_b);
    auto *c_matrix =
        reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
    bool ok = DoBlasInternalImpl(
        cublasSgemmEx, stream, true /* = pointer_mode_host */,
        true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
        CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF,
        lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix,
        SE_CUDA_DATA_HALF, ldc);
    if (!ok) {
      LOG(ERROR) << "failed BLAS call, see log for details";
      return false;
    }
  }
  return true;
}
   2414 
   2415 bool CUDABlas::DoBlasGemmStridedBatched(
   2416     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2417     uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
   2418     int64 stride_a, const DeviceMemory<float> &b, int ldb, int64 stride_b,
   2419     float beta, DeviceMemory<float> *c, int ldc, int64 stride_c,
   2420     int batch_count) {
   2421   return DoBlasInternal(
   2422       cublasSgemmStridedBatched, stream, true /* = pointer_mode_host */,
   2423       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
   2424       GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
   2425       GpuMemoryMutable(c), ldc, stride_c, batch_count);
   2426 }
   2427 
   2428 bool CUDABlas::DoBlasGemmStridedBatched(
   2429     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2430     uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
   2431     int64 stride_a, const DeviceMemory<double> &b, int ldb, int64 stride_b,
   2432     double beta, DeviceMemory<double> *c, int ldc, int64 stride_c,
   2433     int batch_count) {
   2434   return DoBlasInternal(
   2435       cublasDgemmStridedBatched, stream, true /* = pointer_mode_host */,
   2436       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
   2437       GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
   2438       GpuMemoryMutable(c), ldc, stride_c, batch_count);
   2439 }
   2440 
   2441 bool CUDABlas::DoBlasGemmStridedBatched(
   2442     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2443     uint64 n, uint64 k, std::complex<float> alpha,
   2444     const DeviceMemory<std::complex<float>> &a, int lda, int64 stride_a,
   2445     const DeviceMemory<std::complex<float>> &b, int ldb, int64 stride_b,
   2446     std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
   2447     int64 stride_c, int batch_count) {
   2448   return DoBlasInternal(
   2449       cublasCgemmStridedBatched, stream, true /* = pointer_mode_host */,
   2450       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
   2451       GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
   2452       GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
   2453       GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
   2454 }
   2455 
   2456 bool CUDABlas::DoBlasGemmStridedBatched(
   2457     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
   2458     uint64 n, uint64 k, std::complex<double> alpha,
   2459     const DeviceMemory<std::complex<double>> &a, int lda, int64 stride_a,
   2460     const DeviceMemory<std::complex<double>> &b, int ldb, int64 stride_b,
   2461     std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
   2462     int64 stride_c, int batch_count) {
   2463   return DoBlasInternal(
   2464       cublasZgemmStridedBatched, stream, true /* = pointer_mode_host */,
   2465       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
   2466       GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
   2467       GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
   2468       GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
   2469 }
   2470 
   2471 bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
   2472                           blas::UpperLower uplo, uint64 m, uint64 n,
   2473                           std::complex<float> alpha,
   2474                           const DeviceMemory<std::complex<float>> &a, int lda,
   2475                           const DeviceMemory<std::complex<float>> &b, int ldb,
   2476                           std::complex<float> beta,
   2477                           DeviceMemory<std::complex<float>> *c, int ldc) {
   2478   return DoBlasInternal(cublasChemm, stream, true /* = pointer_mode_host */,
   2479                         CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
   2480                         GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2481                         GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
   2482                         GpuComplex(GpuMemoryMutable(c)), ldc);
   2483 }
   2484 
   2485 bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
   2486                           blas::UpperLower uplo, uint64 m, uint64 n,
   2487                           std::complex<double> alpha,
   2488                           const DeviceMemory<std::complex<double>> &a, int lda,
   2489                           const DeviceMemory<std::complex<double>> &b, int ldb,
   2490                           std::complex<double> beta,
   2491                           DeviceMemory<std::complex<double>> *c, int ldc) {
   2492   return DoBlasInternal(cublasZhemm, stream, true /* = pointer_mode_host */,
   2493                         CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
   2494                         GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2495                         GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
   2496                         GpuComplex(GpuMemoryMutable(c)), ldc);
   2497 }
   2498 
   2499 bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
   2500                           blas::Transpose trans, uint64 n, uint64 k,
   2501                           float alpha,
   2502                           const DeviceMemory<std::complex<float>> &a, int lda,
   2503                           float beta, DeviceMemory<std::complex<float>> *c,
   2504                           int ldc) {
   2505   return DoBlasInternal(cublasCherk, stream, true /* = pointer_mode_host */,
   2506                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2507                         k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2508                         &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
   2509 }
   2510 
   2511 bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
   2512                           blas::Transpose trans, uint64 n, uint64 k,
   2513                           double alpha,
   2514                           const DeviceMemory<std::complex<double>> &a, int lda,
   2515                           double beta, DeviceMemory<std::complex<double>> *c,
   2516                           int ldc) {
   2517   return DoBlasInternal(cublasZherk, stream, true /* = pointer_mode_host */,
   2518                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2519                         k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2520                         &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
   2521 }
   2522 
   2523 bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
   2524                            blas::Transpose trans, uint64 n, uint64 k,
   2525                            std::complex<float> alpha,
   2526                            const DeviceMemory<std::complex<float>> &a, int lda,
   2527                            const DeviceMemory<std::complex<float>> &b, int ldb,
   2528                            float beta, DeviceMemory<std::complex<float>> *c,
   2529                            int ldc) {
   2530   return DoBlasInternal(cublasCher2k, stream, true /* = pointer_mode_host */,
   2531                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2532                         k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2533                         GpuComplex(GpuMemory(b)), ldb, &beta,
   2534                         GpuComplex(GpuMemoryMutable(c)), ldc);
   2535 }
   2536 
   2537 bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
   2538                            blas::Transpose trans, uint64 n, uint64 k,
   2539                            std::complex<double> alpha,
   2540                            const DeviceMemory<std::complex<double>> &a, int lda,
   2541                            const DeviceMemory<std::complex<double>> &b, int ldb,
   2542                            double beta, DeviceMemory<std::complex<double>> *c,
   2543                            int ldc) {
   2544   return DoBlasInternal(cublasZher2k, stream, true /* = pointer_mode_host */,
   2545                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2546                         k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2547                         GpuComplex(GpuMemory(b)), ldb, &beta,
   2548                         GpuComplex(GpuMemoryMutable(c)), ldc);
   2549 }
   2550 
   2551 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
   2552                           blas::UpperLower uplo, uint64 m, uint64 n,
   2553                           float alpha, const DeviceMemory<float> &a, int lda,
   2554                           const DeviceMemory<float> &b, int ldb, float beta,
   2555                           DeviceMemory<float> *c, int ldc) {
   2556   return DoBlasInternal(cublasSsymm, stream, true /* = pointer_mode_host */,
   2557                         CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
   2558                         &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
   2559                         GpuMemoryMutable(c), ldc);
   2560 }
   2561 
   2562 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
   2563                           blas::UpperLower uplo, uint64 m, uint64 n,
   2564                           double alpha, const DeviceMemory<double> &a, int lda,
   2565                           const DeviceMemory<double> &b, int ldb, double beta,
   2566                           DeviceMemory<double> *c, int ldc) {
   2567   return DoBlasInternal(cublasDsymm, stream, true /* = pointer_mode_host */,
   2568                         CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
   2569                         &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
   2570                         GpuMemoryMutable(c), ldc);
   2571 }
   2572 
   2573 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
   2574                           blas::UpperLower uplo, uint64 m, uint64 n,
   2575                           std::complex<float> alpha,
   2576                           const DeviceMemory<std::complex<float>> &a, int lda,
   2577                           const DeviceMemory<std::complex<float>> &b, int ldb,
   2578                           std::complex<float> beta,
   2579                           DeviceMemory<std::complex<float>> *c, int ldc) {
   2580   return DoBlasInternal(cublasCsymm, stream, true /* = pointer_mode_host */,
   2581                         CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
   2582                         GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2583                         GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
   2584                         GpuComplex(GpuMemoryMutable(c)), ldc);
   2585 }
   2586 
   2587 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
   2588                           blas::UpperLower uplo, uint64 m, uint64 n,
   2589                           std::complex<double> alpha,
   2590                           const DeviceMemory<std::complex<double>> &a, int lda,
   2591                           const DeviceMemory<std::complex<double>> &b, int ldb,
   2592                           std::complex<double> beta,
   2593                           DeviceMemory<std::complex<double>> *c, int ldc) {
   2594   return DoBlasInternal(cublasZsymm, stream, true /* = pointer_mode_host */,
   2595                         CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
   2596                         GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2597                         GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
   2598                         GpuComplex(GpuMemoryMutable(c)), ldc);
   2599 }
   2600 
   2601 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   2602                           blas::Transpose trans, uint64 n, uint64 k,
   2603                           float alpha, const DeviceMemory<float> &a, int lda,
   2604                           float beta, DeviceMemory<float> *c, int ldc) {
   2605   return DoBlasInternal(cublasSsyrk, stream, true /* = pointer_mode_host */,
   2606                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2607                         k, &alpha, GpuMemory(a), lda, &beta,
   2608                         GpuMemoryMutable(c), ldc);
   2609 }
   2610 
   2611 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   2612                           blas::Transpose trans, uint64 n, uint64 k,
   2613                           double alpha, const DeviceMemory<double> &a, int lda,
   2614                           double beta, DeviceMemory<double> *c, int ldc) {
   2615   return DoBlasInternal(cublasDsyrk, stream, true /* = pointer_mode_host */,
   2616                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2617                         k, &alpha, GpuMemory(a), lda, &beta,
   2618                         GpuMemoryMutable(c), ldc);
   2619 }
   2620 
   2621 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   2622                           blas::Transpose trans, uint64 n, uint64 k,
   2623                           std::complex<float> alpha,
   2624                           const DeviceMemory<std::complex<float>> &a, int lda,
   2625                           std::complex<float> beta,
   2626                           DeviceMemory<std::complex<float>> *c, int ldc) {
   2627   return DoBlasInternal(cublasCsyrk, stream, true /* = pointer_mode_host */,
   2628                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2629                         k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2630                         GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)),
   2631                         ldc);
   2632 }
   2633 
   2634 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   2635                           blas::Transpose trans, uint64 n, uint64 k,
   2636                           std::complex<double> alpha,
   2637                           const DeviceMemory<std::complex<double>> &a, int lda,
   2638                           std::complex<double> beta,
   2639                           DeviceMemory<std::complex<double>> *c, int ldc) {
   2640   return DoBlasInternal(cublasZsyrk, stream, true /* = pointer_mode_host */,
   2641                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2642                         k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2643                         GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)),
   2644                         ldc);
   2645 }
   2646 
   2647 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   2648                            blas::Transpose trans, uint64 n, uint64 k,
   2649                            float alpha, const DeviceMemory<float> &a, int lda,
   2650                            const DeviceMemory<float> &b, int ldb, float beta,
   2651                            DeviceMemory<float> *c, int ldc) {
   2652   return DoBlasInternal(cublasSsyr2k, stream, true /* = pointer_mode_host */,
   2653                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2654                         k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
   2655                         GpuMemoryMutable(c), ldc);
   2656 }
   2657 
   2658 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   2659                            blas::Transpose trans, uint64 n, uint64 k,
   2660                            double alpha, const DeviceMemory<double> &a, int lda,
   2661                            const DeviceMemory<double> &b, int ldb, double beta,
   2662                            DeviceMemory<double> *c, int ldc) {
   2663   return DoBlasInternal(cublasDsyr2k, stream, true /* = pointer_mode_host */,
   2664                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2665                         k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
   2666                         GpuMemoryMutable(c), ldc);
   2667 }
   2668 
   2669 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   2670                            blas::Transpose trans, uint64 n, uint64 k,
   2671                            std::complex<float> alpha,
   2672                            const DeviceMemory<std::complex<float>> &a, int lda,
   2673                            const DeviceMemory<std::complex<float>> &b, int ldb,
   2674                            std::complex<float> beta,
   2675                            DeviceMemory<std::complex<float>> *c, int ldc) {
   2676   return DoBlasInternal(cublasCsyr2k, stream, true /* = pointer_mode_host */,
   2677                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2678                         k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2679                         GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
   2680                         GpuComplex(GpuMemoryMutable(c)), ldc);
   2681 }
   2682 
   2683 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   2684                            blas::Transpose trans, uint64 n, uint64 k,
   2685                            std::complex<double> alpha,
   2686                            const DeviceMemory<std::complex<double>> &a, int lda,
   2687                            const DeviceMemory<std::complex<double>> &b, int ldb,
   2688                            std::complex<double> beta,
   2689                            DeviceMemory<std::complex<double>> *c, int ldc) {
   2690   return DoBlasInternal(cublasZsyr2k, stream, true /* = pointer_mode_host */,
   2691                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
   2692                         k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2693                         GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
   2694                         GpuComplex(GpuMemoryMutable(c)), ldc);
   2695 }
   2696 
   2697 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   2698                           blas::UpperLower uplo, blas::Transpose transa,
   2699                           blas::Diagonal diag, uint64 m, uint64 n, float alpha,
   2700                           const DeviceMemory<float> &a, int lda,
   2701                           DeviceMemory<float> *b, int ldb) {
   2702   return DoBlasInternal(cublasStrmm, stream, true /* = pointer_mode_host */,
   2703                         CUDABlasSide(side), CUDABlasUpperLower(uplo),
   2704                         CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
   2705                         &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
   2706                         GpuMemoryMutable(b), ldb);
   2707 }
   2708 
   2709 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   2710                           blas::UpperLower uplo, blas::Transpose transa,
   2711                           blas::Diagonal diag, uint64 m, uint64 n, double alpha,
   2712                           const DeviceMemory<double> &a, int lda,
   2713                           DeviceMemory<double> *b, int ldb) {
   2714   return DoBlasInternal(cublasDtrmm, stream, true /* = pointer_mode_host */,
   2715                         CUDABlasSide(side), CUDABlasUpperLower(uplo),
   2716                         CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
   2717                         &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
   2718                         GpuMemoryMutable(b), ldb);
   2719 }
   2720 
   2721 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   2722                           blas::UpperLower uplo, blas::Transpose transa,
   2723                           blas::Diagonal diag, uint64 m, uint64 n,
   2724                           std::complex<float> alpha,
   2725                           const DeviceMemory<std::complex<float>> &a, int lda,
   2726                           DeviceMemory<std::complex<float>> *b, int ldb) {
   2727   return DoBlasInternal(cublasCtrmm, stream, true /* = pointer_mode_host */,
   2728                         CUDABlasSide(side), CUDABlasUpperLower(uplo),
   2729                         CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
   2730                         GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2731                         GpuComplex(GpuMemoryMutable(b)), ldb,
   2732                         GpuComplex(GpuMemoryMutable(b)), ldb);
   2733 }
   2734 
   2735 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   2736                           blas::UpperLower uplo, blas::Transpose transa,
   2737                           blas::Diagonal diag, uint64 m, uint64 n,
   2738                           std::complex<double> alpha,
   2739                           const DeviceMemory<std::complex<double>> &a, int lda,
   2740                           DeviceMemory<std::complex<double>> *b, int ldb) {
   2741   return DoBlasInternal(cublasZtrmm, stream, true /* = pointer_mode_host */,
   2742                         CUDABlasSide(side), CUDABlasUpperLower(uplo),
   2743                         CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
   2744                         GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2745                         GpuComplex(GpuMemoryMutable(b)), ldb,
   2746                         GpuComplex(GpuMemoryMutable(b)), ldb);
   2747 }
   2748 
   2749 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   2750                           blas::UpperLower uplo, blas::Transpose transa,
   2751                           blas::Diagonal diag, uint64 m, uint64 n, float alpha,
   2752                           const DeviceMemory<float> &a, int lda,
   2753                           DeviceMemory<float> *b, int ldb) {
   2754   return DoBlasInternal(cublasStrsm, stream, true /* = pointer_mode_host */,
   2755                         CUDABlasSide(side), CUDABlasUpperLower(uplo),
   2756                         CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
   2757                         &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
   2758 }
   2759 
   2760 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   2761                           blas::UpperLower uplo, blas::Transpose transa,
   2762                           blas::Diagonal diag, uint64 m, uint64 n, double alpha,
   2763                           const DeviceMemory<double> &a, int lda,
   2764                           DeviceMemory<double> *b, int ldb) {
   2765   return DoBlasInternal(cublasDtrsm, stream, true /* = pointer_mode_host */,
   2766                         CUDABlasSide(side), CUDABlasUpperLower(uplo),
   2767                         CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
   2768                         &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
   2769 }
   2770 
   2771 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   2772                           blas::UpperLower uplo, blas::Transpose transa,
   2773                           blas::Diagonal diag, uint64 m, uint64 n,
   2774                           std::complex<float> alpha,
   2775                           const DeviceMemory<std::complex<float>> &a, int lda,
   2776                           DeviceMemory<std::complex<float>> *b, int ldb) {
   2777   return DoBlasInternal(cublasCtrsm, stream, true /* = pointer_mode_host */,
   2778                         CUDABlasSide(side), CUDABlasUpperLower(uplo),
   2779                         CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
   2780                         GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2781                         GpuComplex(GpuMemoryMutable(b)), ldb);
   2782 }
   2783 
   2784 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   2785                           blas::UpperLower uplo, blas::Transpose transa,
   2786                           blas::Diagonal diag, uint64 m, uint64 n,
   2787                           std::complex<double> alpha,
   2788                           const DeviceMemory<std::complex<double>> &a, int lda,
   2789                           DeviceMemory<std::complex<double>> *b, int ldb) {
   2790   return DoBlasInternal(cublasZtrsm, stream, true /* = pointer_mode_host */,
   2791                         CUDABlasSide(side), CUDABlasUpperLower(uplo),
   2792                         CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
   2793                         GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
   2794                         GpuComplex(GpuMemoryMutable(b)), ldb);
   2795 }
   2796 
   2797 }  // namespace gpu
   2798 
// Registers the cuBLAS plugin factory with the PluginRegistry and makes it
// the default BLAS plugin for the CUDA platform. The factory lambda is
// invoked per StreamExecutor; it returns nullptr (after logging) when the
// executor is not a CUDA GpuExecutor or when cuBLAS initialization fails.
// Registration failure is logged but not fatal.
void initialize_cublas() {
  port::Status status =
      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::BlasFactory>(
          cuda::kCudaPlatformId, gpu::kCuBlasPlugin, "cuBLAS",
          [](internal::StreamExecutorInterface *parent) -> blas::BlasSupport * {
            // dynamic_cast guards against being handed an executor for a
            // different platform.
            gpu::GpuExecutor *cuda_executor =
                dynamic_cast<gpu::GpuExecutor *>(parent);
            if (cuda_executor == nullptr) {
              LOG(ERROR)
                  << "Attempting to initialize an instance of the cuBLAS "
                  << "support library with a non-CUDA StreamExecutor";
              return nullptr;
            }

            // Ownership of the CUDABlas instance passes to the caller on
            // success.
            gpu::CUDABlas *blas = new gpu::CUDABlas(cuda_executor);
            if (!blas->Init()) {
              // Note: Init() will log a more specific error.
              delete blas;
              return nullptr;
            }
            return blas;
          });

  if (!status.ok()) {
    LOG(ERROR) << "Unable to register cuBLAS factory: "
               << status.error_message();
  }

  // Make cuBLAS the default BLAS implementation for CUDA devices.
  PluginRegistry::Instance()->SetDefaultFactory(
      cuda::kCudaPlatformId, PluginKind::kBlas, gpu::kCuBlasPlugin);
}
   2830 
   2831 }  // namespace stream_executor
   2832 
// Module initializer: runs initialize_cublas() at program/library load so the
// cuBLAS plugin is registered before any StreamExecutor requests BLAS support.
REGISTER_MODULE_INITIALIZER(register_cublas,
                            { stream_executor::initialize_cublas(); });
   2835