/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "cuda/include/cublas_v2.h"
#include "cuda/include/cuda.h"

#define SE_CUDA_DATA_HALF CUDA_R_16F

#include "tensorflow/stream_executor/cuda/cuda_blas.h"

// Both Eigen's Half.h and CUDA's cuda_fp16.h provide a similar typedef for
// __half. As such, there are two ways to get the typedef for __half:
//
// (1) Include cuda_fp16.h and define EIGEN_HAS_CUDA_FP16.
// (2) Neither include cuda_fp16.h nor define EIGEN_HAS_CUDA_FP16.
//
// Due to issue b/73793421, when the first approach is used and NVCC is used
// to compile this file, NVCC will complain about a duplicated definition of
// EIGEN_HAS_CUDA_FP16. On the other hand, when the second approach is used
// and clang is used to compile this file, clang will not understand __half
// because the definition and the macro EIGEN_HAS_CUDA_FP16 are missing.
//
// Because this file may be compiled with clang but will never be compiled
// with NVCC, we choose the first approach for CUDA < 9.0. For CUDA >= 9.0, we
// have to use the second approach because the data member of the __half
// defined by CUDA >= 9.0 is `__x` while Eigen expects it to be `x`.
//
// TODO(b/73793421): Remove the following code block to switch to the second
// approach when the issue is fixed.
#if CUDA_VERSION < 9000
#include "cuda/include/cuda_fp16.h"
#define EIGEN_HAS_CUDA_FP16
#endif

#include "third_party/eigen3/Eigen/Core"

#include <assert.h>
#include <complex>

#include "absl/strings/str_cat.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/status_macros.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream_executor.h"

namespace stream_executor {
namespace gpu {

PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);

static string ToString(cublasStatus_t status) {
  switch (status) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 8000
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
#endif
    default:
      return absl::StrCat("<invalid cublas status: ", status, ">");
  }
}

// Decides whether to enable TENSOR_OP_MATH, based on the
// TF_DISABLE_CUBLAS_TENSOR_OP_MATH environment variable.
static bool TensorOpMathEnabled() {
  static bool is_enabled = [] {
    bool is_disabled;
    TF_CHECK_OK(
        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
                                       /*default_val=*/false, &is_disabled));
    return !is_disabled;
  }();
  return is_enabled;
}

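// For example, a process launched as (illustrative shell invocation;
// `train.py` is a hypothetical entry point):
//
//   TF_DISABLE_CUBLAS_TENSOR_OP_MATH=1 python train.py
//
// will see TensorOpMathEnabled() return false for its whole lifetime, since
// the static initializer above reads the environment variable once and
// latches the result.
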
// cuBLAS has interfaces that permit pointers to be passed from either the host
// memory space or the device memory space; however, you must instruct it as to
// which address space those pointers are in with cublasSetPointerMode.
//
// This helper sets the cuBLAS pointer mode to a desired value for a cuBLAS
// call you are about to perform in a given scope.
//
// The prior cuBLAS pointer mode is retained and restored when this object goes
// out of scope.
class ScopedCublasPointerMode {
 public:
  // Note that, because the setting of the cublas pointer mode is fallible,
  // construction of this scoped datatype must be paired with a call to
  // Init().
  //
  // Parameters:
  //  handle: The cublas library handle to act upon in setting the pointer
  //  mode.
  explicit ScopedCublasPointerMode(cublasHandle_t handle)
      : handle_(handle), ok_(false) {}

  // Attempts the switch to the requested scoped pointer mode, new_mode.
  //
  // Note that when false is returned, an appropriate error has already been
  // logged.
  bool Init(cublasPointerMode_t new_mode) {
    cublasStatus_t ret = cublasGetPointerMode(handle_, &old_mode_);
    if (ret != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "failed to get old cublas pointer mode: " << ToString(ret);
      return ok_ = false;
    }

    ret = cublasSetPointerMode(handle_, new_mode);
    if (ret != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "failed to set new cublas pointer mode: " << ToString(ret);
      return ok_ = false;
    }

    return ok_ = true;
  }

  // Switches back to the prior pointer mode, if the switch operation was
  // successful in the first place.
  ~ScopedCublasPointerMode() {
    if (ok_) {
      cublasStatus_t ret = cublasSetPointerMode(handle_, old_mode_);
      if (ret != CUBLAS_STATUS_SUCCESS) {
        LOG(ERROR) << "failed to set former cublas pointer mode: "
                   << ToString(ret);
      }
    }
  }

 private:
  cublasHandle_t handle_;         // Handle to the cuBLAS instance of interest.
  cublasPointerMode_t old_mode_;  // Prior cuBLAS pointer mode, to be restored.
  bool ok_;                       // Whether the change was successful.
};

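// A minimal usage sketch of the RAII pattern above (hypothetical caller code;
// ScopedCublasMathMode below follows the same construct-then-Init protocol):
//
//   {
//     ScopedCublasPointerMode pointer_mode{handle};
//     if (!pointer_mode.Init(CUBLAS_POINTER_MODE_HOST)) {
//       return false;  // Init() has already logged the error.
//     }
//     // ... perform cuBLAS calls that take host-side scalar pointers ...
//   }  // The destructor restores the prior pointer mode.
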
#if CUDA_VERSION >= 9000
// cuBLAS has interfaces that permit computations to use the Volta hardware.
// This must be enabled via the cublasGet/SetMathMode APIs.
//
// This helper sets the cuBLAS math mode to a desired value for a cuBLAS call
// you are about to perform in a given scope.
//
// The prior cuBLAS math mode is retained and restored when this object goes
// out of scope.
class ScopedCublasMathMode {
 public:
  // Note that, because the setting of the cublas math mode is fallible,
  // construction of this scoped datatype must be paired with a call to
  // Init().
  //
  // Parameters:
  //  handle: The cublas library handle to act upon in setting the math mode.
  explicit ScopedCublasMathMode(cublasHandle_t handle)
      : handle_(handle), ok_(false) {}

  // Attempts the switch to the requested scoped math mode, new_mode.
  //
  // Note that when false is returned, an appropriate error has already been
  // logged.
  bool Init(cublasMath_t new_mode) {
    cublasStatus_t ret = cublasGetMathMode(handle_, &old_mode_);
    if (ret != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "failed to get old cublas math mode: " << ToString(ret);
      return ok_ = false;
    }

    ret = cublasSetMathMode(handle_, new_mode);
    if (ret != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "failed to set new cublas math mode: " << ToString(ret);
      return ok_ = false;
    }
    return ok_ = true;
  }

  // Switches back to the prior math mode, if the switch operation was
  // successful in the first place.
  ~ScopedCublasMathMode() {
    if (ok_) {
      cublasStatus_t ret = cublasSetMathMode(handle_, old_mode_);
      if (ret != CUBLAS_STATUS_SUCCESS) {
        LOG(ERROR) << "failed to set former cublas math mode: "
                   << ToString(ret);
      }
    }
  }

 private:
  cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
  cublasMath_t old_mode_;  // Prior cuBLAS math mode, to be restored.
  bool ok_;                // Whether the change was successful.
};
#endif  // CUDA_VERSION >= 9000

bool CUDABlas::Init() {
  gpu::ScopedActivateExecutorContext sac{parent_};
  cublasStatus_t ret = cublasCreate(&blas_);
  if (ret != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to create cublas handle: " << ToString(ret);
    return false;
  }

  return true;
}

CUDABlas::CUDABlas(gpu::GpuExecutor *parent)
    : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}

CUDABlas::~CUDABlas() {
  if (blas_ != nullptr) {
    gpu::ScopedActivateExecutorContext sac{parent_};
    cublasDestroy(blas_);
  }
}

bool CUDABlas::SetStream(Stream *stream) {
  CHECK(stream != nullptr);
  CHECK(AsGpuStreamValue(stream) != nullptr);
  CHECK(blas_ != nullptr);
  gpu::ScopedActivateExecutorContext sac{parent_};
  cublasStatus_t ret = cublasSetStream(blas_, AsGpuStreamValue(stream));
  if (ret != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret);
    return false;
  }

  return true;
}

namespace {

// Helper functions transforming blas arguments into cuBLAS arguments.

cublasOperation_t CUDABlasTranspose(blas::Transpose trans) {
  switch (trans) {
    case blas::Transpose::kNoTranspose:
      return CUBLAS_OP_N;
    case blas::Transpose::kTranspose:
      return CUBLAS_OP_T;
    case blas::Transpose::kConjugateTranspose:
      return CUBLAS_OP_C;
    default:
      LOG(FATAL) << "Invalid value of blas::Transpose.";
  }
}

cublasFillMode_t CUDABlasUpperLower(blas::UpperLower uplo) {
  switch (uplo) {
    case blas::UpperLower::kUpper:
      return CUBLAS_FILL_MODE_UPPER;
    case blas::UpperLower::kLower:
      return CUBLAS_FILL_MODE_LOWER;
    default:
      LOG(FATAL) << "Invalid value of blas::UpperLower.";
  }
}

cublasDiagType_t CUDABlasDiagonal(blas::Diagonal diag) {
  switch (diag) {
    case blas::Diagonal::kUnit:
      return CUBLAS_DIAG_UNIT;
    case blas::Diagonal::kNonUnit:
      return CUBLAS_DIAG_NON_UNIT;
    default:
      LOG(FATAL) << "Invalid value of blas::Diagonal.";
  }
}

cublasSideMode_t CUDABlasSide(blas::Side side) {
  switch (side) {
    case blas::Side::kLeft:
      return CUBLAS_SIDE_LEFT;
    case blas::Side::kRight:
      return CUBLAS_SIDE_RIGHT;
    default:
      LOG(FATAL) << "Invalid value of blas::Side.";
  }
}

// CUDADataType<T>::type translates from a C++ type (e.g. float) to a
// cudaDataType_t (e.g. CUDA_R_32F). CUDAComputationType(ty) translates from a
// blas::ComputationType to a cudaDataType_t.
//
// These are used to build the argument type and computation type args to
// cublasGemmEx.
template <typename T>
struct CUDADataType;

template <>
struct CUDADataType<Eigen::half> {
  static constexpr cudaDataType_t type = SE_CUDA_DATA_HALF;
};

template <>
struct CUDADataType<std::complex<Eigen::half>> {
  static constexpr cudaDataType_t type = CUDA_C_16F;
};

template <>
struct CUDADataType<float> {
  static constexpr cudaDataType_t type = CUDA_R_32F;
};

template <>
struct CUDADataType<std::complex<float>> {
  static constexpr cudaDataType_t type = CUDA_C_32F;
};

template <>
struct CUDADataType<double> {
  static constexpr cudaDataType_t type = CUDA_R_64F;
};

template <>
struct CUDADataType<std::complex<double>> {
  static constexpr cudaDataType_t type = CUDA_C_64F;
};

template <>
struct CUDADataType<int> {
  static constexpr cudaDataType_t type = CUDA_R_32I;
};

template <>
struct CUDADataType<int8> {
  static constexpr cudaDataType_t type = CUDA_R_8I;
};

template <>
struct CUDADataType<std::complex<int8>> {
  static constexpr cudaDataType_t type = CUDA_C_8I;
};

template <>
struct CUDADataType<uint8> {
  static constexpr cudaDataType_t type = CUDA_R_8U;
};

template <>
struct CUDADataType<std::complex<uint8>> {
  static constexpr cudaDataType_t type = CUDA_C_8U;
};

cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
  switch (ty) {
    case blas::ComputationType::kF16:
      return CUDA_R_16F;
    case blas::ComputationType::kF32:
      return CUDA_R_32F;
    case blas::ComputationType::kF64:
      return CUDA_R_64F;
    case blas::ComputationType::kI32:
      return CUDA_R_32I;
    case blas::ComputationType::kComplexF32:
      return CUDA_C_32F;
    case blas::ComputationType::kComplexF64:
      return CUDA_C_64F;
  }
}

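// The mapping above can be sanity-checked at compile time; an illustrative
// (non-exhaustive) sketch:
//
//   static_assert(CUDADataType<float>::type == CUDA_R_32F,
//                 "float must map to CUDA_R_32F");
//   static_assert(CUDADataType<std::complex<double>>::type == CUDA_C_64F,
//                 "complex<double> must map to CUDA_C_64F");
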
}  // namespace

template <typename FuncT, typename... Args>
bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                  bool pointer_mode_host, bool err_on_failure,
                                  bool use_tensor_op_math, Args... args) {
  mutex_lock lock(mu_);

  CHECK(blas_ != nullptr);
  if (!SetStream(stream)) {
    return false;
  }

  gpu::ScopedActivateExecutorContext sac{parent_};
  ScopedCublasPointerMode pointer_mode{blas_};
  if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST
                                           : CUBLAS_POINTER_MODE_DEVICE)) {
    return false;
  }
#if CUDA_VERSION >= 9000
  ScopedCublasMathMode math_mode{blas_};
  if (use_tensor_op_math) {
    if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
      return false;
    }
  }
#endif
  cublasStatus_t ret = cublas_func(blas_, args...);
  if ((err_on_failure || VLOG_IS_ON(3)) && ret != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to run cuBLAS routine: " << ToString(ret);
  }
  return ret == CUBLAS_STATUS_SUCCESS;
}

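// Convention for the wrappers below: the `pointer_mode_host` argument of
// DoBlasInternal states where scalar parameters live. With
// true /* = pointer_mode_host */, scalars such as alpha and beta are host
// pointers (hence the `&alpha` arguments); with false, scalar inputs and
// outputs such as `result` are device pointers (DeviceMemory), matching
// CUBLAS_POINTER_MODE_DEVICE in DoBlasInternalImpl above.
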
bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                          const DeviceMemory<float> &x, int incx,
                          DeviceMemory<float> *result) {
  return DoBlasInternal(cublasSasum, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                          const DeviceMemory<double> &x, int incx,
                          DeviceMemory<double> *result) {
  return DoBlasInternal(cublasDasum, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          DeviceMemory<float> *result) {
  return DoBlasInternal(cublasScasum, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          DeviceMemory<double> *result) {
  return DoBlasInternal(cublasDzasum, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
                          const DeviceMemory<float> &x, int incx,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSaxpy, stream, true /* = pointer_mode_host */,
                        elem_count, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
                          const DeviceMemory<double> &x, int incx,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDaxpy, stream, true /* = pointer_mode_host */,
                        elem_count, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCaxpy, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZaxpy, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                          const DeviceMemory<float> &x, int incx,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasScopy, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
                        incy);
}

bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                          const DeviceMemory<double> &x, int incx,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDcopy, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
                        incy);
}

bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCcopy, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZcopy, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
                         const DeviceMemory<float> &x, int incx,
                         const DeviceMemory<float> &y, int incy,
                         DeviceMemory<float> *result) {
  return DoBlasInternal(cublasSdot, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
                         const DeviceMemory<double> &x, int incx,
                         const DeviceMemory<double> &y, int incy,
                         DeviceMemory<double> *result) {
  return DoBlasInternal(cublasDdot, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *result) {
  return DoBlasInternal(cublasCdotc, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(result)));
}

bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *result) {
  return DoBlasInternal(cublasZdotc, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(result)));
}

bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *result) {
  return DoBlasInternal(cublasCdotu, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(result)));
}

bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *result) {
  return DoBlasInternal(cublasZdotu, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(result)));
}

bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                          const DeviceMemory<float> &x, int incx,
                          DeviceMemory<float> *result) {
  return DoBlasInternal(cublasSnrm2, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                          const DeviceMemory<double> &x, int incx,
                          DeviceMemory<double> *result) {
  return DoBlasInternal(cublasDnrm2, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          DeviceMemory<float> *result) {
  return DoBlasInternal(cublasScnrm2, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          DeviceMemory<double> *result) {
  return DoBlasInternal(cublasDznrm2, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                         DeviceMemory<float> *x, int incx,
                         DeviceMemory<float> *y, int incy, float c, float s) {
  return DoBlasInternal(cublasSrot, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, &c, &s);
}

bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                         DeviceMemory<double> *x, int incx,
                         DeviceMemory<double> *y, int incy, double c,
                         double s) {
  return DoBlasInternal(cublasDrot, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, &c, &s);
}

bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                         DeviceMemory<std::complex<float>> *x, int incx,
                         DeviceMemory<std::complex<float>> *y, int incy,
                         float c, float s) {
  return DoBlasInternal(cublasCsrot, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
}

bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                         DeviceMemory<std::complex<double>> *x, int incx,
                         DeviceMemory<std::complex<double>> *y, int incy,
                         double c, double s) {
  return DoBlasInternal(cublasZdrot, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
}

bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
                          DeviceMemory<float> *b, DeviceMemory<float> *c,
                          DeviceMemory<float> *s) {
  return DoBlasInternal(cublasSrotg, stream, false /* = pointer_mode_host */,
                        GpuMemoryMutable(a), GpuMemoryMutable(b),
                        GpuMemoryMutable(c), GpuMemoryMutable(s));
}

bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
                          DeviceMemory<double> *b, DeviceMemory<double> *c,
                          DeviceMemory<double> *s) {
  return DoBlasInternal(cublasDrotg, stream, false /* = pointer_mode_host */,
                        GpuComplex(GpuMemoryMutable(a)), GpuMemoryMutable(b),
                        GpuMemoryMutable(c), GpuMemoryMutable(s));
}

bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
                          DeviceMemory<std::complex<float>> *b,
                          DeviceMemory<float> *c,
                          DeviceMemory<std::complex<float>> *s) {
  return DoBlasInternal(
      cublasCrotg, stream, false /* = pointer_mode_host */,
      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
}

bool CUDABlas::DoBlasRotg(Stream *stream,
                          DeviceMemory<std::complex<double>> *a,
                          DeviceMemory<std::complex<double>> *b,
                          DeviceMemory<double> *c,
                          DeviceMemory<std::complex<double>> *s) {
  return DoBlasInternal(
      cublasZrotg, stream, false /* = pointer_mode_host */,
      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
}

bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *x, int incx,
                          DeviceMemory<float> *y, int incy,
                          const DeviceMemory<float> &param) {
  return DoBlasInternal(cublasSrotm, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, GpuMemory(param));
}

bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                          DeviceMemory<double> *x, int incx,
                          DeviceMemory<double> *y, int incy,
                          const DeviceMemory<double> &param) {
  return DoBlasInternal(cublasDrotm, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, GpuMemory(param));
}

bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
                           DeviceMemory<float> *d2, DeviceMemory<float> *x1,
                           const DeviceMemory<float> &y1,
                           DeviceMemory<float> *param) {
  return DoBlasInternal(cublasSrotmg, stream, false /* = pointer_mode_host */,
                        GpuMemoryMutable(d1), GpuMemoryMutable(d2),
                        GpuMemoryMutable(x1), GpuMemory(y1),
                        GpuMemoryMutable(param));
}

bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
                           DeviceMemory<double> *d2, DeviceMemory<double> *x1,
                           const DeviceMemory<double> &y1,
                           DeviceMemory<double> *param) {
  return DoBlasInternal(cublasDrotmg, stream, false /* = pointer_mode_host */,
                        GpuMemoryMutable(d1), GpuMemoryMutable(d2),
                        GpuMemoryMutable(x1), GpuMemory(y1),
                        GpuMemoryMutable(param));
}

bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasSscal, stream, true /* = pointer_mode_host */,
                        elem_count, &alpha, GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDscal, stream, true /* = pointer_mode_host */,
                        elem_count, &alpha, GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCsscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZdscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                          std::complex<float> alpha,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                          std::complex<double> alpha,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *x, int incx,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<double> *x, int incx,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<std::complex<float>> *x, int incx,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<std::complex<double>> *x, int incx,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIsamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIdamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIcamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x,
                           int incx, DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIzamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIsamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIdamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIcamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x,
                           int incx, DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIzamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku, float alpha,
                          const DeviceMemory<float> &a, int lda,
                          const DeviceMemory<float> &x, int incx, float beta,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
                        GpuMemory(a), lda, GpuMemory(x), incx, &beta,
                        GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku, double alpha,
                          const DeviceMemory<double> &a, int lda,
                          const DeviceMemory<double> &x, int incx, double beta,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
                        GpuMemory(a), lda, GpuMemory(x), incx, &beta,
                        GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, float alpha, const DeviceMemory<float> &a,
                          int lda, const DeviceMemory<float> &x, int incx,
                          float beta, DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}

bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, double alpha, const DeviceMemory<double> &a,
                          int lda, const DeviceMemory<double> &x, int incx,
                          double beta, DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}

bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
                         const DeviceMemory<float> &x, int incx,
                         const DeviceMemory<float> &y, int incy,
                         DeviceMemory<float> *a, int lda) {
  return DoBlasInternal(cublasSger, stream, true /* = pointer_mode_host */, m,
                        n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
                        GpuMemoryMutable(a), lda);
}

bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
                         const DeviceMemory<double> &x, int incx,
                         const DeviceMemory<double> &y, int incy,
                         DeviceMemory<double> *a, int lda) {
  return DoBlasInternal(cublasDger, stream, true /* = pointer_mode_host */, m,
                        n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
                        GpuMemoryMutable(a), lda);
}

bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCgerc, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZgerc, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCgeru, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZgeru, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasChbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZhbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasChemv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZhemv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                         float alpha,
                         const DeviceMemory<std::complex<float>> &x, int incx,
                         DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCher, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha,
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                         double alpha,
                         const DeviceMemory<std::complex<double>> &x, int incx,
                         DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZher, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha,
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCher2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZher2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &ap,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasChpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &ap,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZhpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         float alpha,
                         const DeviceMemory<std::complex<float>> &x, int incx,
                         DeviceMemory<std::complex<float>> *ap) {
  return DoBlasInternal(cublasChpr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(ap)));
}

bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         double alpha,
                         const DeviceMemory<std::complex<double>> &x, int incx,
                         DeviceMemory<std::complex<double>> *ap) {
  return DoBlasInternal(cublasZhpr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemoryMutable(ap)));
}

bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *ap) {
  return DoBlasInternal(
      cublasChpr2, stream, true /* = pointer_mode_host */,
      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)),
      incx, GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(ap)));
}

bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *ap) {
  return DoBlasInternal(
      cublasZhpr2, stream, true /* = pointer_mode_host */,
      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)),
      incx, GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(ap)));
}

bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, float alpha, const DeviceMemory<float> &a,
                          int lda, const DeviceMemory<float> &x, int incx,
                          float beta, DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSsbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}

bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, double alpha, const DeviceMemory<double> &a,
                          int lda, const DeviceMemory<double> &x, int incx,
                          double beta, DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDsbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}

bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &ap,
                          const DeviceMemory<float> &x, int incx, float beta,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSspmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &ap,
                          const DeviceMemory<double> &x, int incx, double beta,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDspmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         float alpha, const DeviceMemory<float> &x, int incx,
                         DeviceMemory<float> *ap) {
  return DoBlasInternal(cublasSspr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(ap));
}

bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         double alpha, const DeviceMemory<double> &x, int incx,
                         DeviceMemory<double> *ap) {
  return DoBlasInternal(cublasDspr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(ap));
}

bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &x, int incx,
                          const DeviceMemory<float> &y, int incy,
                          DeviceMemory<float> *ap) {
  return DoBlasInternal(cublasSspr2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemory(y), incy, GpuMemoryMutable(ap));
}

bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &x, int incx,
                          const DeviceMemory<double> &y, int incy,
                          DeviceMemory<double> *ap) {
  return DoBlasInternal(cublasDspr2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemory(y), incy, GpuMemoryMutable(ap));
}

bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &a, int lda,
                          const DeviceMemory<float> &x, int incx, float beta,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSsymv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &a, int lda,
                          const DeviceMemory<double> &x, int incx, double beta,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDsymv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}

bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         float alpha, const DeviceMemory<float> &x, int incx,
                         DeviceMemory<float> *a, int lda) {
  return DoBlasInternal(cublasSsyr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(a), lda);
}

bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                         double alpha, const DeviceMemory<double> &x, int incx,
                         DeviceMemory<double> *a, int lda) {
  return DoBlasInternal(cublasDsyr, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemoryMutable(a), lda);
}

bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &x, int incx,
                          const DeviceMemory<float> &y, int incy,
                          DeviceMemory<float> *a, int lda) {
  return DoBlasInternal(cublasSsyr2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemory(y), incy, GpuMemoryMutable(a), lda);
}

bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &x, int incx,
                          const DeviceMemory<double> &y, int incy,
                          DeviceMemory<double> *a, int lda) {
  return DoBlasInternal(cublasDsyr2, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
                        GpuMemory(y), incy, GpuMemoryMutable(a), lda);
}

bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasStbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<std::complex<float>> &a,
                          int lda, DeviceMemory<std::complex<float>> *x,
                          int incx) {
  return DoBlasInternal(cublasCtbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<std::complex<double>> &a,
                          int lda, DeviceMemory<std::complex<double>> *x,
                          int incx) {
  return DoBlasInternal(cublasZtbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasStbsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtbsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<std::complex<float>> &a,
                          int lda, DeviceMemory<std::complex<float>> *x,
                          int incx) {
  return DoBlasInternal(cublasCtbsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          uint64 k, const DeviceMemory<std::complex<double>> &a,
                          int lda, DeviceMemory<std::complex<double>> *x,
                          int incx) {
  return DoBlasInternal(cublasZtbsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
                          int incx) {
  return DoBlasInternal(cublasStpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<double> &ap,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<float>> &ap,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCtpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<double>> &ap,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZtpmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
                          int incx) {
  return DoBlasInternal(cublasStpsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<double> &ap,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtpsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<float>> &ap,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCtpsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}


bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<float>> &ap,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCtpsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<double>> &ap,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZtpsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasStrmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtrmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCtrmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZtrmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasStrsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDtrsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
                        GpuMemoryMutable(x), incx);
}

bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCtrsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}

bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZtrsv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
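
// All of the level-2 wrappers above follow one pattern: translate the
// platform-neutral blas:: enums to their cuBLAS equivalents and forward the
// call through DoBlasInternal, which binds the cuBLAS handle to the stream and
// checks the returned status. As a sketch of the convention (illustrative
// only; cublasSswap is a real cuBLAS routine, but treat this wrapper shape as
// a schematic rather than a verbatim part of this file):
//
//   bool CUDABlas::DoBlasSwap(Stream *stream, uint64 n,
//                             DeviceMemory<float> *x, int incx,
//                             DeviceMemory<float> *y, int incy) {
//     return DoBlasInternal(cublasSswap, stream,
//                           true /* = pointer_mode_host */, n,
//                           GpuMemoryMutable(x), incx, GpuMemoryMutable(y),
//                           incy);
//   }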

bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
                          float alpha, const DeviceMemory<Eigen::half> &a,
                          int lda, const DeviceMemory<Eigen::half> &b, int ldb,
                          float beta, DeviceMemory<Eigen::half> *c, int ldc) {
#if CUDA_VERSION >= 7050
  VLOG(1) << port::Printf(
      "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
      "c=%p ldc=%d",
      static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
      a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
  if (transa == blas::Transpose::kNoTranspose) {
    if (lda < static_cast<int64>(m)) {
      LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
                      "precondition violation";
    }
  } else {
    if (lda < static_cast<int64>(k)) {
      LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
                   << ") (transpose case); precondition violation";
    }
  }
  if (transb == blas::Transpose::kNoTranspose) {
    if (ldb < static_cast<int64>(k)) {
      LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
                   << ") (no transpose case); precondition violation";
    }
  } else {
    if (ldb < static_cast<int64>(n)) {
      LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
                      "precondition violation";
    }
  }

  bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
  int cc_major, cc_minor;
  stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                   &cc_minor);

  // GPUs < sm_70 don't support tensor ops.
  if (cc_major >= 7 && TensorOpMathEnabled()) {
    use_tensor_ops = true;
  }
#endif

  return DoBlasInternalImpl(
      cublasSgemmEx, stream, true /* = pointer_mode_host */,
      true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
      CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
      SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
      GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);

#else
  LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
             << "(need at least CUDA 7.5)";
  return false;
#endif
}

bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
                          float alpha, const DeviceMemory<float> &a, int lda,
                          const DeviceMemory<float> &b, int ldb, float beta,
                          DeviceMemory<float> *c, int ldc) {
  VLOG(1) << port::Printf(
      "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
      "c=%p ldc=%d",
      static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
      a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
  if (transa == blas::Transpose::kNoTranspose) {
    if (lda < static_cast<int64>(m)) {
      LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
                      "precondition violation";
    }
  } else {
    if (lda < static_cast<int64>(k)) {
      LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
                   << ") (transpose case); precondition violation";
    }
  }
  if (transb == blas::Transpose::kNoTranspose) {
    if (ldb < static_cast<int64>(k)) {
      LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
                   << ") (no transpose case); precondition violation";
    }
  } else {
    if (ldb < static_cast<int64>(n)) {
      LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
                      "precondition violation";
    }
  }
  return DoBlasInternal(cublasSgemm, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
                        n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
                        &beta, GpuMemoryMutable(c), ldc);
}
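
// The lda/ldb checks above encode the standard column-major GEMM
// preconditions: a leading dimension must be at least the number of rows of
// the operand as stored. A concrete illustration (values chosen for the
// example only):
//
//   // C(3x2) = A(3x4) * B(4x2), no transposes, column-major:
//   //   lda >= m (3), ldb >= k (4), ldc >= m (3).
//   // If A were transposed, the stored matrix is k x m, so lda >= k.
//
// Violations are only logged as warnings here; the call is still forwarded,
// and cuBLAS reports its own error for genuinely invalid arguments.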

bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
                          double alpha, const DeviceMemory<double> &a, int lda,
                          const DeviceMemory<double> &b, int ldb, double beta,
                          DeviceMemory<double> *c, int ldc) {
  return DoBlasInternal(cublasDgemm, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
                        n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
                        &beta, GpuMemoryMutable(c), ldc);
}

bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &b, int ldb,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasCgemm, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
                        n, k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &b, int ldb,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZgemm, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
                        n, k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasGemvWithProfiling(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
    int incx, float beta, DeviceMemory<float> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
                                     incx, beta, y, incy,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemvWithProfiling(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
    int incx, double beta, DeviceMemory<double> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
                                     incx, beta, y, incy,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemvWithProfiling(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
    std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
    int lda, const DeviceMemory<std::complex<float>> &x, int incx,
    std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
                                     incx, beta, y, incy,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemvWithProfiling(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
    std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
    int lda, const DeviceMemory<std::complex<double>> &x, int incx,
    std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
                                     incx, beta, y, incy,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemmWithProfiling(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
    int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
    DeviceMemory<Eigen::half> *c, int ldc,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
                                     lda, b, ldb, beta, c, ldc,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemmWithProfiling(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
    int ldc, blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
                                     lda, b, ldb, beta, c, ldc,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemmWithProfiling(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
    const DeviceMemory<double> &b, int ldb, double beta,
    DeviceMemory<double> *c, int ldc,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
                                     lda, b, ldb, beta, c, ldc,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemmWithProfiling(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<float> alpha,
    const DeviceMemory<std::complex<float>> &a, int lda,
    const DeviceMemory<std::complex<float>> &b, int ldb,
    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
                                     lda, b, ldb, beta, c, ldc,
                                     output_profile_result);
}

bool CUDABlas::DoBlasGemmWithProfiling(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<double> alpha,
    const DeviceMemory<std::complex<double>> &a, int lda,
    const DeviceMemory<std::complex<double>> &b, int ldb,
    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
                                     lda, b, ldb, beta, c, ldc,
                                     output_profile_result);
}
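
// The *WithProfiling overloads above all funnel into the two Impl templates
// below, which run the underlying gemv/gemm and time it with a GpuTimer,
// reporting the elapsed wall time through blas::ProfileResult. A caller-side
// sketch (illustrative only; assumes a valid Stream, device buffers, and the
// usual ProfileResult accessors):
//
//   blas::ProfileResult profile;
//   if (blas->DoBlasGemvWithProfiling(stream, blas::Transpose::kNoTranspose,
//                                     m, n, 1.0f, a, lda, x, 1, 0.0f, &y, 1,
//                                     &profile) &&
//       profile.is_valid()) {
//     VLOG(1) << "gemv took " << profile.elapsed_time_in_ms() << " ms";
//   }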

template <typename T>
bool CUDABlas::DoBlasGemvWithProfilingImpl(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, const T &alpha,
    const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
    const T &beta, DeviceMemory<T> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (output_profile_result != nullptr) {
    timer.reset(new GpuTimer(parent_));
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return false;
    }
  }

  // Call DoBlasGemv.
  bool result =
      DoBlasGemv(stream, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);

  if (timer != nullptr && result) {
    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
    // state.
    if (!timer->Stop(AsGpuStream(stream))) {
      return false;
    }
    output_profile_result->set_is_valid(true);
    output_profile_result->set_algorithm(blas::kDefaultBlasGemv);
    output_profile_result->set_elapsed_time_in_ms(
        timer->GetElapsedMilliseconds());
  }
  return result;
}

template <typename T, typename ParamType>
bool CUDABlas::DoBlasGemmWithProfilingImpl(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
    int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
    DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (output_profile_result != nullptr) {
    timer.reset(new GpuTimer(parent_));
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return false;
    }
  }

  // Call DoBlasGemm.
  bool result = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a, lda, b,
                           ldb, beta, c, ldc);

  if (timer != nullptr && result) {
    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
    // state.
    if (!timer->Stop(AsGpuStream(stream))) {
      return false;
    }
    output_profile_result->set_is_valid(true);
    output_profile_result->set_algorithm(blas::kDefaultBlasGemm);
    output_profile_result->set_elapsed_time_in_ms(
        timer->GetElapsedMilliseconds());
  }
  return result;
}

static bool UsesTensorOps(blas::AlgorithmType algo) {
#if CUDA_VERSION >= 9000
  cublasGemmAlgo_t cublas_algo = static_cast<cublasGemmAlgo_t>(algo);
  return cublas_algo >= CUBLAS_GEMM_DEFAULT_TENSOR_OP;
#else
  return false;
#endif
}

template <typename InType>
static bool TensorOpsAvailable(int cc_major) {
#if CUDA_VERSION >= 9000
  // cublas *does* allow tensor ops on inputs that are not fp16, so this is not
  // strictly correct. We can't simply enable it, though, as that would change
  // clients' behavior significantly: using tensor ops on fp32 inputs causes
  // them to be rounded to fp16.
  if (cc_major >= 7 && TensorOpMathEnabled() &&
      std::is_same<InType, Eigen::half>::value) {
    return true;
  }
#endif
  return false;
}
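
// Taken together, UsesTensorOps() and TensorOpsAvailable() implement the
// gating policy for tensor cores: an explicitly requested *_TENSOR_OP
// algorithm is honored only on sm_70+ hardware, only while the
// TensorOpMathEnabled() kill switch is on, and (for now) only for fp16
// inputs. A sketch of the decision, with made-up values for illustration:
//
//   // cc_major = 7 (e.g. V100), fp16 inputs,
//   // algo = CUBLAS_GEMM_ALGO3_TENSOR_OP:
//   //   UsesTensorOps(algo) == true and
//   //   TensorOpsAvailable<Eigen::half>(7) == true, so the request proceeds.
//   // The same algo with float inputs:
//   //   TensorOpsAvailable<float>(7) == false, so
//   //   DoBlasGemmWithAlgorithmImpl below rejects the request.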

template <typename InT, typename OutT, typename CompT>
bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<CompT> &alpha,
    const DeviceMemory<InT> &a, int lda, const DeviceMemory<InT> &b, int ldb,
    const HostOrDeviceScalar<CompT> &beta, DeviceMemory<OutT> *c, int ldc,
    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  // GPUs < sm_50 don't support cublasGemmEx.
  int cc_major, cc_minor;
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor) &&
      cc_major < 5) {
    VLOG(2) << "DoBlasGemmWithAlgorithm returning false because sm" << cc_major
            << cc_minor << " devices don't support explicit gemm algorithms.";
    return false;
  }

  if (UsesTensorOps(algorithm) && !TensorOpsAvailable<InT>(cc_major)) {
    if (std::is_same<InT, Eigen::half>::value) {
      VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
              << algorithm
              << " uses tensor ops, but tensor ops are not available in sm"
              << cc_major << "X devices.";
    } else {
      VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
              << algorithm
              << " uses tensor ops, but the input data type is not fp16.";
    }
    return false;
  }

  // Either both 'alpha' and 'beta' need to be pointers to device memory, or
  // they need to be both host scalars.
  if (alpha.is_pointer() != beta.is_pointer()) {
    VLOG(2) << "DoBlasGemmWithAlgorithm returning false because one of "
               "`alpha` and `beta` is a pointer, but the other is not.";
    return false;
  }

  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (output_profile_result != nullptr) {
    timer.reset(new GpuTimer(parent_));
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      VLOG(2) << "DoBlasGemmWithAlgorithm returning false because "
                 "output_profile_result was given, but we were unable to "
                 "create a GpuTimer.";
      return false;
    }
  }

  // Return false if we might be hitting a cuBLAS bug that produces the wrong
  // result. See nvbugs/2156201, b/79126339.
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
  if ((algorithm == CUBLAS_GEMM_DEFAULT || algorithm >= CUBLAS_GEMM_ALGO13) &&
      std::max({m, n, k}) >= 2097153 && cc_major < 7) {
    VLOG(2) << "DoBlasGemmWithAlgorithm returning false to work around a "
               "cuBLAS < 9.2 bug with m, n, or k >= 2097153. See b/79126339.";
    return false;
  }
#endif

  cudaDataType_t cuda_in_type = CUDADataType<InT>::type;
  // Since we are converting 'algorithm' to cublasGemmAlgo_t by static_cast,
  // we do the following compile-time check on the default value:
  static_assert(blas::kDefaultGemmAlgo == CUBLAS_GEMM_DFALT, "");
  // If 'alpha' and 'beta' are host scalars and CompT is Eigen::half, we
  // essentially reinterpret_cast to __half, which is safe because Eigen::half
  // inherits from __half.
  bool result = DoBlasInternalFailureOK(
      cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(),
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
      alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(),
      GpuMemory(a), cuda_in_type, lda, GpuMemory(b), cuda_in_type, ldb,
      beta.is_pointer() ? GpuMemory(beta.pointer()) : &beta.value(),
      GpuMemoryMutable(c), CUDADataType<OutT>::type, ldc,
      CUDAComputationType(computation_type),
      static_cast<cublasGemmAlgo_t>(algorithm));

  if (timer != nullptr && result) {
    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
    // state.
    if (!timer->Stop(AsGpuStream(stream))) {
      VLOG(2) << "DoBlasGemmWithAlgorithm returning false; unable to stop "
                 "GpuTimer.";
      return false;
    }
    output_profile_result->set_is_valid(true);
    output_profile_result->set_algorithm(algorithm);
    output_profile_result->set_elapsed_time_in_ms(
        timer->GetElapsedMilliseconds());
  }
  return result;
}

bool CUDABlas::GetBlasGemmAlgorithms(
    std::vector<blas::AlgorithmType> *out_algorithms) {
  // cublasGemmAlgo_t (and the function that accepts this type, cublasGemmEx)
  // were first introduced in CUDA 8.
  //
  // Note that when the CUDA version and compute capability are not sufficient,
  // we still return out_algorithms; the caller needs to make sure that, in
  // this case, the returned vector is empty.
  *out_algorithms = {
    CUBLAS_GEMM_DFALT,
    CUBLAS_GEMM_ALGO0,
    CUBLAS_GEMM_ALGO1,
    CUBLAS_GEMM_ALGO2,
    CUBLAS_GEMM_ALGO3,
    CUBLAS_GEMM_ALGO4,
    CUBLAS_GEMM_ALGO5,
    CUBLAS_GEMM_ALGO6,
    CUBLAS_GEMM_ALGO7,
#if CUDA_VERSION >= 9000
    CUBLAS_GEMM_ALGO8,
    CUBLAS_GEMM_ALGO9,
    CUBLAS_GEMM_ALGO10,
    CUBLAS_GEMM_ALGO11,
    CUBLAS_GEMM_ALGO12,
    CUBLAS_GEMM_ALGO13,
    CUBLAS_GEMM_ALGO14,
    CUBLAS_GEMM_ALGO15,
    CUBLAS_GEMM_ALGO16,
    CUBLAS_GEMM_ALGO17,
    CUBLAS_GEMM_DFALT_TENSOR_OP,
    CUBLAS_GEMM_ALGO0_TENSOR_OP,
    CUBLAS_GEMM_ALGO1_TENSOR_OP,
    CUBLAS_GEMM_ALGO2_TENSOR_OP,
    CUBLAS_GEMM_ALGO3_TENSOR_OP,
    CUBLAS_GEMM_ALGO4_TENSOR_OP,
#endif
#if CUDA_VERSION >= 9020
    CUBLAS_GEMM_ALGO18,
    CUBLAS_GEMM_ALGO19,
    CUBLAS_GEMM_ALGO20,
    CUBLAS_GEMM_ALGO21,
    CUBLAS_GEMM_ALGO22,
    CUBLAS_GEMM_ALGO23,
    CUBLAS_GEMM_ALGO5_TENSOR_OP,
    CUBLAS_GEMM_ALGO6_TENSOR_OP,
    CUBLAS_GEMM_ALGO7_TENSOR_OP,
    CUBLAS_GEMM_ALGO8_TENSOR_OP,
    CUBLAS_GEMM_ALGO9_TENSOR_OP,
    CUBLAS_GEMM_ALGO10_TENSOR_OP,
    CUBLAS_GEMM_ALGO11_TENSOR_OP,
    CUBLAS_GEMM_ALGO12_TENSOR_OP,
    CUBLAS_GEMM_ALGO13_TENSOR_OP,
    CUBLAS_GEMM_ALGO14_TENSOR_OP,
    CUBLAS_GEMM_ALGO15_TENSOR_OP,
#endif
  };
  return true;
}

bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
    const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b, int ldb,
    const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c, int ldc,
    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}
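
// GetBlasGemmAlgorithms and the DoBlasGemmWithAlgorithm overloads are
// designed to be used together for autotuning. A caller-side sketch
// (illustrative only; assumes a valid Stream, device buffers, and host
// scalars already exist):
//
//   std::vector<blas::AlgorithmType> algorithms;
//   blas->GetBlasGemmAlgorithms(&algorithms);
//   blas::AlgorithmType best = blas::kDefaultGemmAlgo;
//   float best_ms = std::numeric_limits<float>::infinity();
//   for (blas::AlgorithmType algo : algorithms) {
//     blas::ProfileResult profile;
//     if (blas->DoBlasGemmWithAlgorithm(stream, transa, transb, m, n, k,
//                                       alpha, a, lda, b, ldb, beta, &c, ldc,
//                                       computation_type, algo, &profile) &&
//         profile.is_valid() && profile.elapsed_time_in_ms() < best_ms) {
//       best = algo;
//       best_ms = profile.elapsed_time_in_ms();
//     }
//   }
//
// An unsupported algorithm simply makes DoBlasGemmWithAlgorithm return false,
// so it drops out of the search rather than aborting it.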

bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
    const DeviceMemory<Eigen::half> &a, int lda,
    const DeviceMemory<Eigen::half> &b, int ldb,
    const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
    int ldc, blas::ComputationType computation_type,
    blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  if (computation_type == blas::ComputationType::kF32) {
    if (alpha.is_pointer() || beta.is_pointer()) {
      // We cannot easily convert a pointer to f16 memory to a pointer to f32
      // memory from here, so we don't support this for now.
      // TODO(akuegel): Investigate whether we can do the conversion before
      // calling DoBlasGemmWithAlgorithm.
      return false;
    }
    HostOrDeviceScalar<float> float_alpha(static_cast<float>(alpha.value()));
    HostOrDeviceScalar<float> float_beta(static_cast<float>(beta.value()));
    return DoBlasGemmWithAlgorithmImpl(
        stream, transa, transb, m, n, k, float_alpha, a, lda, b, ldb,
        float_beta, c, ldc, computation_type, algorithm,
        output_profile_result);
  }

  CHECK_EQ(computation_type, blas::ComputationType::kF16);
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}

bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
    int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
    int ldc, blas::ComputationType computation_type,
    blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}

bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
    int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
    int ldc, blas::ComputationType computation_type,
    blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}

bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
    const DeviceMemory<std::complex<float>> &a, int lda,
    const DeviceMemory<std::complex<float>> &b, int ldb,
    const HostOrDeviceScalar<std::complex<float>> &beta,
    DeviceMemory<std::complex<float>> *c, int ldc,
    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}

bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
    const DeviceMemory<std::complex<double>> &a, int lda,
    const DeviceMemory<std::complex<double>> &b, int ldb,
    const HostOrDeviceScalar<std::complex<double>> &beta,
    DeviceMemory<std::complex<double>> *c, int ldc,
    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}

template <typename T>
struct HalfAsFloat {
  typedef T type;
};

template <>
struct HalfAsFloat<Eigen::half> {
  typedef float type;
};

template <typename T, typename Scalar, typename FuncT>
port::Status CUDABlas::DoBlasGemmBatchedInternal(
    FuncT cublas_func, Stream *stream, blas::Transpose transa,
    blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
    const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
    const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
    Scalar beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
  for (int i = 0; i < batch_count; ++i) {
    a_raw_ptrs.push_back(static_cast<T *>(a_ptrs_to_wrappers[i]->opaque()));
    b_raw_ptrs.push_back(static_cast<T *>(b_ptrs_to_wrappers[i]->opaque()));
    c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
  }

  typedef typename HalfAsFloat<typename GpuComplexT<T>::type>::type CUDA_T;

  const size_t size = batch_count * sizeof(CUDA_T *);

  // Device-side copy of pointers to matrices.
  DeviceMemory<CUDA_T *> a;
  DeviceMemory<CUDA_T *> b;
  DeviceMemory<CUDA_T *> c;

  // If temporary space is allocated for device-side copies of pointers to
  // matrices, that temporary space should not be freed until this function
  // returns. Although the values for these unique_ptrs are not set here, they
  // are declared at this scope so they will be destroyed when the function
  // returns.
  //
  // If a scratch allocator is provided, these pointers will not be used at
  // all.
  std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_temporary;
  std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_temporary;
  std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_temporary;

  // Decide how to allocate device-side copy of pointers to matrices based on
  // whether a scratch allocator was passed.
  if (scratch_allocator != nullptr) {
    SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> a_bytes,
                        scratch_allocator->AllocateBytes(stream, size));
    SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> b_bytes,
                        scratch_allocator->AllocateBytes(stream, size));
    SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> c_bytes,
                        scratch_allocator->AllocateBytes(stream, size));
    a = DeviceMemory<CUDA_T *>(a_bytes);
    b = DeviceMemory<CUDA_T *>(b_bytes);
    c = DeviceMemory<CUDA_T *>(c_bytes);
  } else {
    SE_ASSIGN_OR_RETURN(a_temporary,
                        stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
    SE_ASSIGN_OR_RETURN(b_temporary,
                        stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
    SE_ASSIGN_OR_RETURN(c_temporary,
                        stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
    a = DeviceMemory<CUDA_T *>(*a_temporary->mutable_device_memory());
    b = DeviceMemory<CUDA_T *>(*b_temporary->mutable_device_memory());
    c = DeviceMemory<CUDA_T *>(*c_temporary->mutable_device_memory());
  }

  if (!stream->ThenMemcpy(&a, a_raw_ptrs.data(), size).ok() ||
      !stream->ThenMemcpy(&b, b_raw_ptrs.data(), size).ok() ||
      !stream->ThenMemcpy(&c, c_raw_ptrs.data(), size).ok()) {
    return port::Status(port::error::INTERNAL,
                        "failed to copy memory from host to device in "
                        "CUDABlas::DoBlasGemmBatched");
  }

  cudaDataType_t data_type = CUDADataType<T>::type;

#if CUDA_VERSION >= 9010
  int cc_major, cc_minor;
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor) &&
      cc_major >= 5) {
    bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
    cublasGemmAlgo_t algo =
        (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
    cudaDataType_t compute_type =
        (data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
    const void **a_void_ptrs = reinterpret_cast<const void **>(
        const_cast<const CUDA_T **>(GpuMemory(a)));
    const void **b_void_ptrs = reinterpret_cast<const void **>(
        const_cast<const CUDA_T **>(GpuMemory(b)));
    void **c_void_ptrs =
        reinterpret_cast<void **>(const_cast<CUDA_T **>(GpuMemory(c)));
    bool ok;
    ok = DoBlasInternalImpl(
        cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
        true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
        CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
        b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
        batch_count, compute_type, algo);
    if (ok) {
      return port::Status::OK();
    }
    return port::Status(port::error::INTERNAL,
                        "failed BLAS call, see log for details");
  }
#endif
  // Either CUDA_VERSION < 9.1 or SM < 5.0.
  if (data_type != CUDA_R_16F) {
    bool ok = DoBlasInternal(
        cublas_func, stream, true /* = pointer_mode_host */,
        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
        GpuComplex(&alpha), const_cast<const CUDA_T **>(GpuMemory(a)), lda,
        const_cast<const CUDA_T **>(GpuMemory(b)), ldb, GpuComplex(&beta),
        const_cast<CUDA_T **>(GpuMemory(c)), ldc, batch_count);
    if (ok) {
      return port::Status::OK();
    }
    return port::Status(port::error::INTERNAL,
                        "failed BLAS call, see log for details");
  } else {
    // Fall back to a loop for fp16.
    for (int b = 0; b < batch_count; ++b) {
      const DeviceMemory<T> &a_matrix = *a_ptrs_to_wrappers[b];
      const DeviceMemory<T> &b_matrix = *b_ptrs_to_wrappers[b];
      DeviceMemory<T> *c_matrix = c_ptrs_to_wrappers[b];
      bool ok = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a_matrix,
                           lda, b_matrix, ldb, beta, c_matrix, ldc);
      if (!ok) {
        return port::Status(port::error::INTERNAL,
                            "failed BLAS call, see log for details");
      }
    }
    return port::Status::OK();
  }
}
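
// DoBlasGemmBatchedInternal uses the pointer-array form of batched GEMM: the
// batch is described by three device-resident arrays of matrix pointers,
// which is why the function first gathers the raw pointers on the host and
// then copies them to scratch (or temporary) device memory before the cuBLAS
// call. A sketch of what gets staged (illustrative values only):
//
//   // batch_count == 3; each a_raw_ptrs[i] is a device address.
//   // After ThenMemcpy, the device array `a` holds
//   //   { a_raw_ptrs[0], a_raw_ptrs[1], a_raw_ptrs[2] },
//   // and cublas*gemmBatched dereferences it on the GPU.
//
// This is in contrast to the strided-batched entry points further below,
// which avoid the pointer staging entirely by assuming a constant stride
// between consecutive matrices.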

bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha,
    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a_array, int lda,
    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b_array, int ldb,
    float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c_array,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  // Note: The func passed here (cublasSgemmBatched) is not actually called,
  // due to special handling of fp16 inside DoBlasGemmBatchedInternal.
  port::Status status = DoBlasGemmBatchedInternal(
      cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}

bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha,
    const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
    const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb,
    float beta, const port::ArraySlice<DeviceMemory<float> *> &c_array,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  port::Status status = DoBlasGemmBatchedInternal(
      cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}

bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, double alpha,
    const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
    const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
    double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  port::Status status = DoBlasGemmBatchedInternal(
      cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}

bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<float> alpha,
    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array,
    int lda,
    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
    int ldb, std::complex<float> beta,
    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  port::Status status = DoBlasGemmBatchedInternal(
      cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}

bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<double> alpha,
    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array,
    int lda,
    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
    int ldb, std::complex<double> beta,
    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  port::Status status = DoBlasGemmBatchedInternal(
      cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}
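
// The strided-batched variants below describe batch element `i` implicitly:
// its matrices start at fixed offsets from the base pointers. In pointer
// arithmetic (element units, not bytes):
//
//   // a_i = a + i * stride_a
//   // b_i = b + i * stride_b
//   // c_i = c + i * stride_c
//
// The fp16 fallback loop in the half-precision overload below computes
// exactly these addresses by hand when cublasGemmStridedBatchedEx is not
// available.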

bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
    int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
    int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
    int64 stride_c, int batch_count) {
  bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
  int cc_major, cc_minor;
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor)) {
    // GPUs < sm_70 don't support tensor ops.
    if (cc_major >= 7 && TensorOpMathEnabled()) {
      use_tensor_ops = true;
    }
#if CUDA_VERSION >= 9010
    if (cc_major >= 5) {
      cublasGemmAlgo_t algo =
          (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
      bool ok = DoBlasInternalImpl(
          cublasGemmStridedBatchedEx, stream, true /* = pointer_mode_host */,
          true /* = err_on_failure */, use_tensor_ops,
          CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
          &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b),
          CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F,
          ldc, stride_c, batch_count, CUDA_R_32F, algo);
      if (ok) {
        return true;
      }
      LOG(ERROR) << "failed BLAS call, see log for details";
      return false;
    }
#endif
  }
#endif
  // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
  for (int batch = 0; batch < batch_count; ++batch) {
    const auto *a_matrix =
        reinterpret_cast<const __half *>(GpuMemory(a) + batch * stride_a);
    const auto *b_matrix =
        reinterpret_cast<const __half *>(GpuMemory(b) + batch * stride_b);
    auto *c_matrix =
        reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
    bool ok = DoBlasInternalImpl(
        cublasSgemmEx, stream, true /* = pointer_mode_host */,
        true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
        CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix,
        SE_CUDA_DATA_HALF, lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta,
        c_matrix, SE_CUDA_DATA_HALF, ldc);
    if (!ok) {
      LOG(ERROR) << "failed BLAS call, see log for details";
      return false;
    }
  }
  return true;
}

bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
    int64 stride_a, const DeviceMemory<float> &b, int ldb, int64 stride_b,
    float beta, DeviceMemory<float> *c, int ldc, int64 stride_c,
    int batch_count) {
  return DoBlasInternal(
      cublasSgemmStridedBatched, stream, true /* = pointer_mode_host */,
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
      GpuMemoryMutable(c), ldc, stride_c, batch_count);
}

bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
    int64 stride_a, const DeviceMemory<double> &b, int ldb, int64 stride_b,
    double beta, DeviceMemory<double> *c, int ldc, int64 stride_c,
    int batch_count) {
  return DoBlasInternal(
      cublasDgemmStridedBatched, stream, true /* = pointer_mode_host */,
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
      GpuMemoryMutable(c), ldc, stride_c, batch_count);
}

bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<float> alpha,
    const DeviceMemory<std::complex<float>> &a, int lda, int64 stride_a,
    const DeviceMemory<std::complex<float>> &b, int ldb, int64 stride_b,
    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
    int64 stride_c, int batch_count) {
  return DoBlasInternal(
      cublasCgemmStridedBatched, stream, true /* = pointer_mode_host */,
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
}

bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<double> alpha,
    const DeviceMemory<std::complex<double>> &a, int lda, int64 stride_a,
    const DeviceMemory<std::complex<double>> &b, int ldb, int64 stride_b,
    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
    int64 stride_c, int batch_count) {
  return DoBlasInternal(
      cublasZgemmStridedBatched, stream, true /* = pointer_mode_host */,
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
}

bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &b, int ldb,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasChemm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &b, int ldb,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZhemm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          float alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          float beta, DeviceMemory<std::complex<float>> *c,
                          int ldc) {
  return DoBlasInternal(cublasCherk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          double alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          double beta, DeviceMemory<std::complex<double>> *c,
                          int ldc) {
  return DoBlasInternal(cublasZherk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           std::complex<float> alpha,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           const DeviceMemory<std::complex<float>> &b, int ldb,
                           float beta, DeviceMemory<std::complex<float>> *c,
                           int ldc) {
  return DoBlasInternal(cublasCher2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, &beta,
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           std::complex<double> alpha,
                           const DeviceMemory<std::complex<double>> &a,
                           int lda,
                           const DeviceMemory<std::complex<double>> &b,
                           int ldb, double beta,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZher2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, &beta,
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          float alpha, const DeviceMemory<float> &a, int lda,
                          const DeviceMemory<float> &b, int ldb, float beta,
                          DeviceMemory<float> *c, int ldc) {
  return DoBlasInternal(cublasSsymm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
                        GpuMemoryMutable(c), ldc);
}

bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          double alpha, const DeviceMemory<double> &a, int lda,
                          const DeviceMemory<double> &b, int ldb, double beta,
                          DeviceMemory<double> *c, int ldc) {
  return DoBlasInternal(cublasDsymm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
                        GpuMemoryMutable(c), ldc);
}

bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &b, int ldb,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasCsymm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
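
// A note on the Hermitian updates above: Herk computes
//   C = alpha * op(A) * op(A)^H + beta * C,
// and Her2k computes
//   C = alpha * op(A) * op(B)^H + conj(alpha) * op(B) * op(A)^H + beta * C.
// Because the result is Hermitian, its diagonal is real, which is why alpha
// (for Herk) and beta (for both) are real scalars rather than complex ones.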

bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &b, int ldb,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZsymm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          float alpha, const DeviceMemory<float> &a, int lda,
                          float beta, DeviceMemory<float> *c, int ldc) {
  return DoBlasInternal(cublasSsyrk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, &alpha, GpuMemory(a), lda, &beta,
                        GpuMemoryMutable(c), ldc);
}

bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          double alpha, const DeviceMemory<double> &a, int lda,
                          double beta, DeviceMemory<double> *c, int ldc) {
  return DoBlasInternal(cublasDsyrk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, &alpha, GpuMemory(a), lda, &beta,
                        GpuMemoryMutable(c), ldc);
}

bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasCsyrk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)),
                        ldc);
}

bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZsyrk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)),
                        ldc);
}

bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           float alpha, const DeviceMemory<float> &a, int lda,
                           const DeviceMemory<float> &b, int ldb, float beta,
                           DeviceMemory<float> *c, int ldc) {
  return DoBlasInternal(cublasSsyr2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
                        GpuMemoryMutable(c), ldc);
}

bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           double alpha, const DeviceMemory<double> &a,
                           int lda, const DeviceMemory<double> &b, int ldb,
                           double beta, DeviceMemory<double> *c, int ldc) {
  return DoBlasInternal(cublasDsyr2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
                        GpuMemoryMutable(c), ldc);
}

bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           std::complex<float> alpha,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           const DeviceMemory<std::complex<float>> &b, int ldb,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasCsyr2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}

bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           std::complex<double> alpha,
                           const DeviceMemory<std::complex<double>> &a,
                           int lda,
                           const DeviceMemory<std::complex<double>> &b,
                           int ldb, std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZsyr2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
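
// A note on the Trmm wrappers that follow: classic BLAS trmm overwrites B in
// place (B = alpha * op(A) * B), whereas cublas*trmm is out-of-place and
// takes a separate output matrix C. Passing GpuMemoryMutable(b) twice, as
// both the input B and the output C, recovers the in-place BLAS semantics
// that this interface promises.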
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
                          const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *b, int ldb) {
  return DoBlasInternal(cublasStrmm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
                        GpuMemoryMutable(b), ldb);
}

bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
                          const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *b, int ldb) {
  return DoBlasInternal(cublasDtrmm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
                        GpuMemoryMutable(b), ldb);
}

bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          DeviceMemory<std::complex<float>> *b, int ldb) {
  return DoBlasInternal(cublasCtrmm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemoryMutable(b)), ldb,
                        GpuComplex(GpuMemoryMutable(b)), ldb);
}

bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          DeviceMemory<std::complex<double>> *b, int ldb) {
  return DoBlasInternal(cublasZtrmm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemoryMutable(b)), ldb,
                        GpuComplex(GpuMemoryMutable(b)), ldb);
}

bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
                          const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *b, int ldb) {
  return DoBlasInternal(cublasStrsm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
}

bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
                          const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *b, int ldb) {
  return DoBlasInternal(cublasDtrsm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
}

bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          DeviceMemory<std::complex<float>> *b, int ldb) {
  return DoBlasInternal(cublasCtrsm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemoryMutable(b)), ldb);
}

bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          DeviceMemory<std::complex<double>> *b, int ldb) {
  return DoBlasInternal(cublasZtrsm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemoryMutable(b)), ldb);
}
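// Usage sketch for the TRSM family above (variable names hypothetical; the
// usual entry point is Stream::ThenBlasTrsm): TRSM solves op(A) * X =
// alpha * B in place, where A is an m x m triangular matrix and B (m x n) is
// overwritten with the solution X.
//
//   stream.ThenBlasTrsm(blas::Side::kLeft, blas::UpperLower::kLower,
//                       blas::Transpose::kNoTranspose,
//                       blas::Diagonal::kNonUnit, m, n,
//                       /*alpha=*/1.0, a, lda, &b, ldb);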
}  // namespace gpu

void initialize_cublas() {
  port::Status status =
      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::BlasFactory>(
          cuda::kCudaPlatformId, gpu::kCuBlasPlugin, "cuBLAS",
          [](internal::StreamExecutorInterface *parent) -> blas::BlasSupport * {
            gpu::GpuExecutor *cuda_executor =
                dynamic_cast<gpu::GpuExecutor *>(parent);
            if (cuda_executor == nullptr) {
              LOG(ERROR)
                  << "Attempting to initialize an instance of the cuBLAS "
                  << "support library with a non-CUDA StreamExecutor";
              return nullptr;
            }

            gpu::CUDABlas *blas = new gpu::CUDABlas(cuda_executor);
            if (!blas->Init()) {
              // Note: Init() will log a more specific error.
              delete blas;
              return nullptr;
            }
            return blas;
          });

  if (!status.ok()) {
    LOG(ERROR) << "Unable to register cuBLAS factory: "
               << status.error_message();
  }

  PluginRegistry::Instance()->SetDefaultFactory(
      cuda::kCudaPlatformId, PluginKind::kBlas, gpu::kCuBlasPlugin);
}

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(register_cublas,
                            { stream_executor::initialize_cublas(); });
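// With the factory registered and set as the default BLAS plugin for the CUDA
// platform, clients never name the plugin explicitly. A minimal end-to-end
// sketch (hypothetical, with error handling elided) of how a BLAS call ends
// up dispatched to CUDABlas:
//
//   Platform *platform =
//       MultiPlatformManager::PlatformWithName("CUDA").ValueOrDie();
//   StreamExecutor *executor = platform->ExecutorForDevice(0).ValueOrDie();
//   Stream stream(executor);
//   stream.Init();
//   stream.ThenBlasGemm(blas::Transpose::kNoTranspose,
//                       blas::Transpose::kNoTranspose, m, n, k,
//                       /*alpha=*/1.0f, a, lda, b, ldb,
//                       /*beta=*/0.0f, &c, ldc);  // routed to cuBLAS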