/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
#define TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_

/**
 * Wrappers and helpers for CUDA device code.
 *
 * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide
 * backwards compatibility, see go/volta-porting for details.
 * Provides atomic operations on types that aren't natively supported.
 */

#if GOOGLE_CUDA

#include <algorithm>
#include <complex>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "cuda/include/cuda.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

namespace detail {

// Helper for range-based for loop using 'delta' increments.
// Usage: see CudaGridRange?() functions below.
template <typename T>
class CudaGridRange {
  struct Iterator {
    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
    __device__ T operator*() const { return index_; }
    __device__ Iterator& operator++() {
      index_ += delta_;
      return *this;
    }
    __device__ bool operator!=(const Iterator& other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'return less'.
      if (!other.delta_) {
        return less;
      }
      if (!delta_) {
        return greater;
      }
      return less || greater;
    }

   private:
    T index_;
    const T delta_;
  };

 public:
  __device__ CudaGridRange(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

  __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
  __device__ Iterator end() const { return Iterator{end_, 0}; }

 private:
  T begin_;
  T delta_;
  T end_;
};

}  // namespace detail

// Helper to visit indices in the range 0 <= i < count, using the x-coordinate
// of the global thread index. That is, each index i is visited by all threads
// with the same x-coordinate.
// Usage: for(int i : CudaGridRangeX(count)) { visit(i); }
template <typename T>
__device__ detail::CudaGridRange<T> CudaGridRangeX(T count) {
  return detail::CudaGridRange<T>(blockIdx.x * blockDim.x + threadIdx.x,
                                  gridDim.x * blockDim.x, count);
}

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : CudaGridRangeY(count)) { visit(i); }
template <typename T>
__device__ detail::CudaGridRange<T> CudaGridRangeY(T count) {
  return detail::CudaGridRange<T>(blockIdx.y * blockDim.y + threadIdx.y,
                                  gridDim.y * blockDim.y, count);
}

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : CudaGridRangeZ(count)) { visit(i); }
template <typename T>
__device__ detail::CudaGridRange<T> CudaGridRangeZ(T count) {
  return detail::CudaGridRange<T>(blockIdx.z * blockDim.z + threadIdx.z,
                                  gridDim.z * blockDim.z, count);
}
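
// Example (illustrative sketch, not part of the original header): an
// elementwise kernel that uses CudaGridRangeX to cover 'count' elements with
// a grid-stride loop. The kernel name is hypothetical.
template <typename T>
__global__ void CudaExampleScaleKernel(int count, T alpha, T* data) {
  for (int i : CudaGridRangeX(count)) {
    // With a 1-D launch, each index in [0, count) is visited exactly once.
    data[i] *= alpha;
  }
}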

// Mask for all 32 threads in a warp.
const unsigned kCudaWarpAll = 0xffffffff;

// Returns the warp lane ID of the calling thread
__device__ inline unsigned CudaLaneId() {
  unsigned int lane_id;
  asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
  return lane_id;
}

namespace detail {
// Returns true if mask is a valid parameter for __shfl*sync to return a well
// defined value, assuming the calling lane will read from src_lane as part of
// the shuffle operation.
//
// Specifically, returns true iff mask has the calling lane bit and the src_lane
// bit set, and the src_lane calls this function with the same mask value
// (required for the two threads to wait for each other).
//
// On Volta, for some invalid masks, this function hangs or returns false
// positives, because the implementation shuffles with the same mask that
// we are validating. Run on Pascal if you suspect that the mask is incorrect.
__device__ inline bool CudaValidateShuffleSyncMask(unsigned mask,
                                                   unsigned src_lane) {
  unsigned src_dst_mask = 1u << CudaLaneId() | 1u << src_lane;
#if CUDA_VERSION >= 9000
  unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
#else
  unsigned src_lane_mask = __shfl(mask, src_lane);
#endif
  return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
}

// Returns the actual source lane for shuffle.
__device__ inline unsigned CudaShuffleGetSrcLane(int src_lane, int width) {
  int lane_id = CudaLaneId();
  int lane_base = lane_id & ~width + 1;
  int lane_offset = src_lane & width - 1;
  return lane_base + lane_offset;
}

// Returns the source lane for shuffle up.
__device__ inline unsigned CudaShuffleUpGetSrcLane(unsigned delta, int width) {
  unsigned lane_id = CudaLaneId();
  if ((lane_id & width - 1) < delta) {
    return lane_id;
  }
  return lane_id - delta;
}

// Returns the source lane for shuffle down.
__device__ inline unsigned CudaShuffleDownGetSrcLane(unsigned delta,
                                                     int width) {
  unsigned lane_id = CudaLaneId();
  if ((lane_id & width - 1) + delta >= width) {
    return lane_id;
  }
  return lane_id + delta;
}

// Returns the source lane for shuffle xor.
__device__ inline unsigned CudaShuffleXorGetSrcLane(int lane_mask, int width) {
  int lane_id = CudaLaneId();
  int src_lane = lane_id ^ lane_mask;
  if (src_lane > (lane_id | width - 1)) {
    return lane_id;
  }
  return src_lane;
}
}  // namespace detail

// For all *_sync wrappers below, it is illegal to synchronize threads from
// different program locations, because that is not supported before sm_70.
// In other words, all threads in 'mask' must call the functions in convergence.
// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly.
//
// It is also illegal to shuffle with a mask that produces an undefined result
// for any of the threads. Specifically, all source threads of the shuffle
// must have their corresponding bit in 'mask' set.

// Wrapper for __syncwarp. No-op for CUDA 8 and earlier.
__device__ inline void CudaSyncWarp(unsigned mask = kCudaWarpAll) {
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  __syncwarp(mask);
#endif
}

// Wrapper for __ballot_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline unsigned CudaBallotSync(unsigned mask, int pred) {
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  return __ballot_sync(mask, pred);
#else
  return __ballot(pred) & mask;  // Apply mask to match __ballot_sync's spec.
#endif
}

// Wrapper for __any_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int CudaAnySync(unsigned mask, int pred) {
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  return __any_sync(mask, pred);
#else
  return __any(pred);
#endif
}

// Wrapper for __all_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int CudaAllSync(unsigned mask, int pred) {
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  return __all_sync(mask, pred);
#else
  return __all(pred);
#endif
}
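
// Example (illustrative sketch): counting how many lanes of a fully converged
// warp satisfy a predicate, by combining CudaBallotSync with __popc. Assumes
// all 32 lanes participate, so kCudaWarpAll is a valid mask. The function name
// is hypothetical.
__device__ inline int CudaExampleCountVotes(int pred) {
  unsigned ballot = CudaBallotSync(kCudaWarpAll, pred);
  return __popc(ballot);  // Set bits == number of lanes with pred != 0.
}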

// Wrapper for __shfl_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T CudaShuffleSync(unsigned mask, T value, int src_lane,
                             int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleGetSrcLane(src_lane, width)));
#if CUDA_VERSION >= 9000
  return __shfl_sync(mask, value, src_lane, width);
#else
  return __shfl(value, src_lane, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleSync(unsigned mask, double value,
                                         int src_lane, int width = warpSize) {
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleSync(mask, hi, src_lane, width);
  lo = CudaShuffleSync(mask, lo, src_lane, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ inline T CudaShuffleUpSync(unsigned mask, T value, unsigned delta,
                                      int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleUpGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(mask, value, delta, width);
#else
  return __shfl_up(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleUpSync(unsigned mask, double value,
                                           unsigned delta,
                                           int width = warpSize) {
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleUpSync(mask, hi, delta, width);
  lo = CudaShuffleUpSync(mask, lo, delta, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
// in convergence, see comment above for details.
template <typename T>
__device__ inline T CudaShuffleDownSync(unsigned mask, T value, unsigned delta,
                                        int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleDownGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(mask, value, delta, width);
#else
  return __shfl_down(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleDownSync(unsigned mask, double value,
                                             unsigned delta,
                                             int width = warpSize) {
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleDownSync(mask, hi, delta, width);
  lo = CudaShuffleDownSync(mask, lo, delta, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T CudaShuffleXorSync(unsigned mask, T value, int lane_mask,
                                int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleXorGetSrcLane(lane_mask, width)));
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, lane_mask, width);
#else
  return __shfl_xor(value, lane_mask, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleXorSync(unsigned mask, double value,
                                            int lane_mask,
                                            int width = warpSize) {
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleXorSync(mask, hi, lane_mask, width);
  lo = CudaShuffleXorSync(mask, lo, lane_mask, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}
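
// Example (illustrative sketch): a butterfly warp reduction built on top of
// CudaShuffleXorSync. After the loop, every lane of a fully converged warp
// holds the sum of all 32 input values. The function name is hypothetical.
template <typename T>
__device__ T CudaExampleWarpReduceSum(T value) {
  for (int lane_mask = warpSize / 2; lane_mask > 0; lane_mask /= 2) {
    value += CudaShuffleXorSync(kCudaWarpAll, value, lane_mask);
  }
  return value;
}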

// Wrapper for __ldg.
template <typename T>
__host__ __device__ T CudaLdg(const T* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}

__host__ __device__ inline bool CudaLdg(const bool* address) {
  return CudaLdg(reinterpret_cast<const char*>(address)) != 0;
}

__host__ __device__ inline std::complex<float> CudaLdg(
    const std::complex<float>* address) {
#if __CUDA_ARCH__ >= 350
  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
  return std::complex<float>(mem.x, mem.y);
#else
  return *address;
#endif
}

__host__ __device__ inline std::complex<double> CudaLdg(
    const std::complex<double>* address) {
#if __CUDA_ARCH__ >= 350
  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
  return std::complex<double>(mem.x, mem.y);
#else
  return *address;
#endif
}
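
// Example (illustrative sketch): streaming a read-only input through the
// read-only data cache with CudaLdg. The kernel name is hypothetical.
template <typename T>
__global__ void CudaExampleCopyKernel(int count, const T* in, T* out) {
  for (int i : CudaGridRangeX(count)) {
    out[i] = CudaLdg(in + i);  // __ldg on sm_35 and later, plain load otherwise.
  }
}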

// Zeroes count elements starting at ptr using all threads of a 1-D grid.
// Note: this function does not synchronize, and therefore the memory range is
// not guaranteed to be zero until the next kernel launch.
template <typename T>
__global__ void SetZero(const int count, T* ptr) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1 && blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : CudaGridRangeX(count)) {
    ptr[i] = T(0);
  }
}

// Helper to set all tensor entries to a specific value.
template <typename T>
__global__ void SetToValue(const int count, T* ptr, T value) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1 && blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : CudaGridRangeX(count)) {
    ptr[i] = value;
  }
}
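
// Example (illustrative sketch): a host-side launch of SetToValue with a
// hand-picked 1-D configuration. TensorFlow kernels would normally derive the
// grid/block sizes from a launch-config helper; the 256-thread block below is
// an assumption for illustration only, and the function name is hypothetical.
template <typename T>
void CudaExampleLaunchSetToValue(int count, T* ptr, T value,
                                 cudaStream_t stream) {
  if (count == 0) return;  // Avoid launching an empty grid.
  const int threads_per_block = 256;
  const int blocks = (count + threads_per_block - 1) / threads_per_block;
  SetToValue<T><<<blocks, threads_per_block, 0, stream>>>(count, ptr, value);
}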

namespace detail {
// Helper function for atomic accumulation implemented as CAS.
template <typename T, typename F>
__device__ T CudaAtomicCasHelper(T* ptr, F accumulate) {
  T old = *ptr;
  T assumed;
  do {
    assumed = old;
    old = atomicCAS(ptr, assumed, accumulate(assumed));
  } while (assumed != old);
  return old;
}

// Overload for floating point (using integer comparison to handle NaN
// correctly).
template <typename F>
__device__ float CudaAtomicCasHelper(float* ptr, F accumulate) {
  // Reinterpret the returned bits back to float (rather than converting the
  // integer value numerically), so the result matches what atomicCAS observed.
  return __int_as_float(
      CudaAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
        return __float_as_int(accumulate(__int_as_float(a)));
      }));
}
template <typename F>
__device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
  return __longlong_as_double(CudaAtomicCasHelper(
      reinterpret_cast<tensorflow::uint64*>(ptr),
      [accumulate](tensorflow::uint64 a) {
        return __double_as_longlong(accumulate(__longlong_as_double(a)));
      }));
}

// Overload of above function for half. Note that we don't have
// atomicCAS() for anything less than 32 bits, so we need to include the
// other 16 bits in the operation.
//
// This version is going to be very slow
// under high concurrency, since most threads will be spinning on failing
// their compare-and-swap tests. (The fact that we get false sharing on the
// neighboring fp16 makes this even worse.) If you are doing a large reduction,
// you are much better off with doing the intermediate steps in fp32 and then
// switching to fp16 as late as you can in the calculations.
//
// Note: Assumes little endian.
template <typename F>
__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
#endif
  namespace half_impl = Eigen::half_impl;
  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
  assert(!(intptr & 0x1));  // should be 2-aligned.
  if (intptr & 0x2) {
    // The half is in the second part of the uint32 (upper 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short high = static_cast<unsigned short>(arg >> 16);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
  } else {
    // The half is in the first part of the uint32 (lower 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr);
    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
  }
}

template <typename From, typename To>
using ToTypeIfConvertible =
    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;

}  // namespace detail

// CUDA provides atomic ops, but not for all types.  We provide wrappers
// for some ops and provide implementation for all reasonable types.

template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicAdd(T* ptr, U value) {
  return atomicAdd(ptr, value);
}

__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a + value; });
}

#if __CUDA_ARCH__ < 600
__device__ inline double CudaAtomicAdd(double* ptr, double value) {
  return detail::CudaAtomicCasHelper(ptr,
                                     [value](double a) { return a + value; });
}
#elif __clang__
// Clang cannot compile __nvvm_atom_add_gen_d builtin yet, use inline PTX.
// see https://reviews.llvm.org/D39638
__device__ inline double CudaAtomicAdd(double* ptr, double value) {
  double result;
  asm volatile("atom.add.f64 %0, [%1], %2;"
               : "=d"(result)
               : "l"(ptr), "d"(value)
               : "memory");
  return result;
}
#endif

// CudaAtomicAdd
// Specializations of CudaAtomicAdd for complex types, which CudaAtomicAdd does
// not support. We treat a std::complex<T>* as a T* (the C++ standard section
// 26.4.4 allows this explicitly) and atomic add the real and imaginary
// components individually. The operation as a whole is not atomic, but we can
// safely treat the components independently for the purpose of accumulating.
__device__ inline std::complex<float> CudaAtomicAdd(std::complex<float>* ptr,
                                                    std::complex<float> value) {
  auto ptr_scalar = reinterpret_cast<float*>(ptr);
  return std::complex<float>(CudaAtomicAdd(ptr_scalar, value.real()),
                             CudaAtomicAdd(ptr_scalar + 1, value.imag()));
}

__device__ inline std::complex<double> CudaAtomicAdd(
    std::complex<double>* ptr, std::complex<double> value) {
  auto ptr_scalar = reinterpret_cast<double*>(ptr);
  return std::complex<double>(CudaAtomicAdd(ptr_scalar, value.real()),
                              CudaAtomicAdd(ptr_scalar + 1, value.imag()));
}
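
// Example (illustrative sketch): scatter-accumulation where several threads
// may update the same output slot, so a plain store would race but
// CudaAtomicAdd is safe. Kernel name and argument layout are hypothetical.
template <typename T>
__global__ void CudaExampleScatterAddKernel(int count, const int* indices,
                                            const T* updates, T* out) {
  for (int i : CudaGridRangeX(count)) {
    CudaAtomicAdd(out + indices[i], updates[i]);
  }
}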

// CudaAtomicSub
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
  return atomicSub(ptr, value);
}

// Specializations of subtraction which add the negative value.
__device__ inline float CudaAtomicSub(float* ptr, float value) {
  return CudaAtomicAdd(ptr, -value);
}

__device__ inline double CudaAtomicSub(double* ptr, double value) {
  return CudaAtomicAdd(ptr, -value);
}

__device__ inline tensorflow::uint64 CudaAtomicSub(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
  return CudaAtomicAdd(ptr, -value);
}

__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a - value; });
}

// CudaAtomicMax
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMax(T* ptr, U value) {
  return atomicMax(ptr, value);
}

__device__ inline float CudaAtomicMax(float* ptr, float value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](float a) { return max(a, value); });
}

__device__ inline double CudaAtomicMax(double* ptr, double value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](double a) { return max(a, value); });
}

__device__ inline Eigen::half CudaAtomicMax(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return max(a, value); });
}

#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
}
#endif

// CudaAtomicMin
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMin(T* ptr, U value) {
  return atomicMin(ptr, value);
}

__device__ inline float CudaAtomicMin(float* ptr, float value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](float a) { return min(a, value); });
}

__device__ inline double CudaAtomicMin(double* ptr, double value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](double a) { return min(a, value); });
}

__device__ inline Eigen::half CudaAtomicMin(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return min(a, value); });
}

#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 CudaAtomicMin(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return min(a, value); });
}
#endif

// CudaAtomicMul
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMul(T* ptr, U value) {
  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a * value; });
}

// CudaAtomicDiv
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicDiv(T* ptr, U value) {
  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a / value; });
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA
#endif  // TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_