Home | History | Annotate | Download | only in random
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
     17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
     18 
     19 #define _USE_MATH_DEFINES
     20 #include <math.h>
     21 #include <cmath>
     22 #undef _USE_MATH_DEFINES
     23 
     24 #include <string.h>
     25 #include <algorithm>
     26 #include <type_traits>
     27 
     28 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
     29 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
     30 #include "tensorflow/core/lib/random/philox_random.h"
     31 
     32 namespace tensorflow {
     33 namespace random {
     34 
     35 // Helper function to convert a 16-bit integer to a half between [0..1).
     36 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
     37 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
     38 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
     39 // Helper function to convert a 32-bit integer to a float between [0..1).
     40 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
     41 // Helper function to convert two 32-bit integers to a double between [0..1).
     42 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
     43 
     44 // Computes a + b. Requires that the result is representable in the destination
     45 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
     46 // need *not* be representable in that type. (The condition on b excludes the
     47 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
     48 // compute.)
     49 template <typename Int>
     50 PHILOX_DEVICE_INLINE Int SignedAdd(Int a,
     51                                    typename std::make_unsigned<Int>::type b) {
     52   // Implementation note: both b_div_2 and b - b_div_2 are positive and
     53   // representatble as Int.
     54   auto b_div_2 = b >> 1;
     55   return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
     56 }
     57 
     58 // A class that generates uniform distribution random numbers from the
     59 // underlying random integer generator.
     60 // Arguments:
     61 //   Generator: a generator type that returns a number of uint32 upon each
     62 //              invocation. It needs to define kResultElementCount for the
     63 //              sample count for each invocation, and ResultType for the
     64 //              actual returned sample type.
     65 //   RealType: the data type of the real numbers that will be returned by the
     66 //             distribution. This could be either float or double for now.
     67 // This class is meant to be implemented through specialization. The default
     68 // is not defined by design.
     69 template <class Generator, typename RealType>
     70 class UniformDistribution;
     71 
     72 template <class Generator>
     73 class UniformDistribution<Generator, Eigen::half> {
     74  public:
     75   // The number of elements that will be returned.
     76   static const int kResultElementCount = Generator::kResultElementCount;
     77   // Cost of generation of a single element (in cycles).
     78   static const int kElementCost = 3;
     79   // Indicate that this distribution may take variable number of samples
     80   // during the runtime.
     81   static const bool kVariableSamplesPerOutput = false;
     82   typedef Array<Eigen::half, kResultElementCount> ResultType;
     83   typedef Eigen::half ResultElementType;
     84 
     85   PHILOX_DEVICE_INLINE
     86   ResultType operator()(Generator* gen) {
     87     typename Generator::ResultType sample = (*gen)();
     88     ResultType result;
     89     for (int i = 0; i < kResultElementCount; ++i) {
     90       result[i] = Uint16ToHalf(sample[i]);  // Truncate the upper 16 bits.
     91     }
     92     return result;
     93   }
     94 };
     95 
     96 template <class Generator>
     97 class UniformDistribution<Generator, bfloat16> {
     98  public:
     99   // The number of elements that will be returned.
    100   static const int kResultElementCount = Generator::kResultElementCount;
    101   // Cost of generation of a single element (in cycles).
    102   static const int kElementCost = 3;
    103   // Indicate that this distribution may take variable number of samples
    104   // during the runtime.
    105   static const bool kVariableSamplesPerOutput = false;
    106   typedef Array<bfloat16, kResultElementCount> ResultType;
    107   typedef bfloat16 ResultElementType;
    108 
    109   PHILOX_DEVICE_INLINE
    110   ResultType operator()(Generator* gen) {
    111     typename Generator::ResultType sample = (*gen)();
    112     ResultType result;
    113     for (int i = 0; i < kResultElementCount; ++i) {
    114       result[i] = Uint16ToGfloat16(sample[i]);
    115     }
    116     return result;
    117   }
    118 };
    119 
    120 template <class Generator>
    121 class UniformDistribution<Generator, float> {
    122  public:
    123   // The number of elements that will be returned.
    124   static const int kResultElementCount = Generator::kResultElementCount;
    125   // Cost of generation of a single element (in cycles).
    126   static const int kElementCost = 3;
    127   // Indicate that this distribution may take variable number of samples
    128   // during the runtime.
    129   static const bool kVariableSamplesPerOutput = false;
    130   typedef Array<float, kResultElementCount> ResultType;
    131   typedef float ResultElementType;
    132 
    133   PHILOX_DEVICE_INLINE
    134   ResultType operator()(Generator* gen) {
    135     typename Generator::ResultType sample = (*gen)();
    136     ResultType result;
    137     for (int i = 0; i < kResultElementCount; ++i) {
    138       result[i] = Uint32ToFloat(sample[i]);
    139     }
    140     return result;
    141   }
    142 };
    143 
    144 template <class Generator>
    145 class UniformDistribution<Generator, double> {
    146  public:
    147   // The number of elements that will be returned.
    148   static const int kResultElementCount = Generator::kResultElementCount / 2;
    149   // Cost of generation of a single element (in cycles).
    150   static const int kElementCost = 3;
    151   // Indicate that this distribution may take variable number of samples
    152   // during the runtime.
    153   static const bool kVariableSamplesPerOutput = false;
    154   typedef Array<double, kResultElementCount> ResultType;
    155   typedef double ResultElementType;
    156 
    157   PHILOX_DEVICE_INLINE
    158   ResultType operator()(Generator* gen) {
    159     typename Generator::ResultType sample = (*gen)();
    160     ResultType result;
    161     for (int i = 0; i < kResultElementCount; ++i) {
    162       result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
    163     }
    164     return result;
    165   }
    166 };
    167 
    168 template <class Generator>
    169 class UniformDistribution<Generator, int32> {
    170  public:
    171   // The number of elements that will be returned.
    172   static const int kResultElementCount = Generator::kResultElementCount;
    173   // Cost of generation of a single element (in cycles).
    174   static const int kElementCost = 3;
    175   // Indicate that this distribution may take variable number of samples
    176   // during the runtime.
    177   static const bool kVariableSamplesPerOutput = false;
    178   typedef Array<int32, kResultElementCount> ResultType;
    179   typedef int32 ResultElementType;
    180 
    181   // Must have lo < hi
    182   UniformDistribution(int32 lo, int32 hi)
    183       : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
    184 
    185   PHILOX_DEVICE_INLINE
    186   ResultType operator()(Generator* gen) {
    187     typename Generator::ResultType sample = (*gen)();
    188     ResultType result;
    189     for (int i = 0; i < kResultElementCount; ++i) {
    190       result[i] = SignedAdd(lo_, sample[i] % range_);
    191     }
    192     return result;
    193   }
    194 
    195  private:
    196   // Note that lo_ is intentionally signed while range_ is intentionally
    197   // unsigned.  This is because hi - lo can overflow signed integers if
    198   // lo < 0 < hi, but always fits in unsigned.
    199   int32 lo_;
    200   uint32 range_;
    201 };
    202 
    203 template <class Generator>
    204 class UniformDistribution<Generator, int64> {
    205  public:
    206   // The number of elements that will be returned.
    207   static const int kResultElementCount = Generator::kResultElementCount / 2;
    208   // Cost of generation of a single element (in cycles).
    209   static const int kElementCost = 3;
    210   // Indicate that this distribution may take variable number of samples
    211   // during the runtime.
    212   static const bool kVariableSamplesPerOutput = false;
    213   typedef Array<int64, kResultElementCount> ResultType;
    214   typedef int64 ResultElementType;
    215 
    216   // Must have lo < hi
    217   UniformDistribution(int64 lo, int64 hi)
    218       : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
    219 
    220   PHILOX_DEVICE_INLINE
    221   ResultType operator()(Generator* gen) {
    222     typename Generator::ResultType sample = (*gen)();
    223     ResultType result;
    224     for (int i = 0; i < kResultElementCount; ++i) {
    225       auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
    226       result[i] = SignedAdd(lo_, bits % range_);
    227     }
    228     return result;
    229   }
    230 
    231  private:
    232   // Note that lo_ is intentionally signed while range_ is intentionally
    233   // unsigned.  This is because hi - lo can overflow signed integers if
    234   // lo < 0 < hi, but always fits in unsigned.
    235   int64 lo_;
    236   uint64 range_;
    237 };
    238 
    239 // A class that adapts the underlying native multiple samples to return a single
    240 // sample at a time.
    241 template <class Generator>
    242 class SingleSampleAdapter {
    243  public:
    244   // The number of elements that will be returned.
    245   static const int kResultElementCount = 1;
    246   // The number of elements that will be returned by the underlying generator.
    247   static const int kNativeElementCount = Generator::kResultElementCount;
    248   typedef typename Generator::ResultElementType ResultType;
    249   typedef typename Generator::ResultElementType ResultElementType;
    250 
    251   PHILOX_DEVICE_INLINE
    252   explicit SingleSampleAdapter(Generator* gen)
    253       : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
    254 
    255   PHILOX_DEVICE_INLINE
    256   ResultType operator()() {
    257     if (used_result_index_ == Generator::kResultElementCount) {
    258       unused_results_ = (*generator_)();
    259       used_result_index_ = 0;
    260     }
    261 
    262     return unused_results_[used_result_index_++];
    263   }
    264 
    265   PHILOX_DEVICE_INLINE
    266   void Skip(uint64 num_skips) {
    267     if (!num_skips) {
    268       return;
    269     }
    270     int num_unused_results = kNativeElementCount - used_result_index_;
    271     if (num_skips <= num_unused_results) {
    272       used_result_index_ += num_skips;
    273       return;
    274     }
    275     num_skips -= num_unused_results;
    276     used_result_index_ = kNativeElementCount;
    277     SkipFromGenerator(num_skips / kNativeElementCount);
    278     num_skips = num_skips % kNativeElementCount;
    279     if (num_skips) {
    280       unused_results_ = (*generator_)();
    281       used_result_index_ = num_skips;
    282     }
    283   }
    284 
    285  private:
    286   // This implementation iteratively skips over `num_skips` samples
    287   // from `generator_`. There is an O(1) implementation for PhiloxRandom
    288   // in random_distributions.cc.
    289   PHILOX_DEVICE_INLINE
    290   void SkipFromGenerator(uint64 num_skips) {
    291     while (num_skips--) {
    292       (*generator_)();
    293     }
    294   }
    295 
    296   Generator* generator_;
    297   typename Generator::ResultType unused_results_;
    298   int used_result_index_;
    299 };
    300 
    301 // A class that generates unit normal distribution random numbers from the
    302 // underlying random integer generator.
    303 // Arguments:
    304 //   Generator: a generator type that returns a number of uint32 upon each
    305 //              each invocation. It needs to define kResultElementCount for the
    306 //              sample count for each invocation, and ResultType for actual
    307 //              returned sample type.
    308 //   RealType: the data type of the real numbers that will be returned by the
    309 //             distribution. This could be either float or double for now.
    310 // This class is meant to be implemented through specialization. The default
    311 // is not defined by design.
    312 template <class Generator, typename RealType>
    313 class NormalDistribution;
    314 
    315 PHILOX_DEVICE_INLINE
    316 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
    317 
    318 PHILOX_DEVICE_INLINE
    319 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
    320                      double* d1);
    321 
    322 // Exactly like the float version, except that we convert to half afterwards;
    323 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
    324 // gain from working in half internally.
    325 template <class Generator>
    326 class NormalDistribution<Generator, Eigen::half> {
    327  public:
    328   // The number of elements that will be returned.
    329   static const int kResultElementCount = Generator::kResultElementCount;
    330   // Cost of generation of a single element (in cycles).
    331   static const int kElementCost = 70;
    332   // Indicate that this distribution may take variable number of samples
    333   // during the runtime.
    334   static const bool kVariableSamplesPerOutput = false;
    335   typedef Array<Eigen::half, kResultElementCount> ResultType;
    336   typedef Eigen::half ResultElementType;
    337 
    338   PHILOX_DEVICE_INLINE
    339   ResultType operator()(Generator* gen) {
    340     typename Generator::ResultType sample = (*gen)();
    341     ResultType result;
    342     for (int i = 0; i < kResultElementCount; i += 2) {
    343       float f[2];
    344       BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
    345       result[i] = Eigen::half(f[0]);
    346       result[i + 1] = Eigen::half(f[1]);
    347     }
    348     return result;
    349   }
    350 };
    351 
    352 template <class Generator>
    353 class NormalDistribution<Generator, bfloat16> {
    354  public:
    355   // The number of elements that will be returned.
    356   static const int kResultElementCount = Generator::kResultElementCount;
    357   // Cost of generation of a single element (in cycles).
    358   static const int kElementCost = 70;
    359   // Indicate that this distribution may take variable number of samples
    360   // during the runtime.
    361   static const bool kVariableSamplesPerOutput = false;
    362   typedef Array<bfloat16, kResultElementCount> ResultType;
    363   typedef bfloat16 ResultElementType;
    364 
    365   PHILOX_DEVICE_INLINE
    366   ResultType operator()(Generator* gen) {
    367     typename Generator::ResultType sample = (*gen)();
    368     ResultType result;
    369     static_assert(kResultElementCount % 2 == 0,
    370                   "kResultElementCount should be an even number");
    371     for (int i = 0; i < kResultElementCount; i += 2) {
    372       float f[2];
    373       // Box-Muller transform requires processing 2 elements at a time.
    374       BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
    375       result[i] = bfloat16(f[0]);
    376       result[i + 1] = bfloat16(f[1]);
    377     }
    378     return result;
    379   }
    380 };
    381 
    382 template <class Generator>
    383 class NormalDistribution<Generator, float> {
    384  public:
    385   // The number of elements that will be returned.
    386   static const int kResultElementCount = Generator::kResultElementCount;
    387   // Cost of generation of a single element (in cycles).
    388   static const int kElementCost = 70;
    389   // Indicate that this distribution may take variable number of samples
    390   // during the runtime.
    391   static const bool kVariableSamplesPerOutput = false;
    392   typedef Array<float, kResultElementCount> ResultType;
    393   typedef float ResultElementType;
    394 
    395   PHILOX_DEVICE_INLINE
    396   ResultType operator()(Generator* gen) {
    397     typename Generator::ResultType sample = (*gen)();
    398     ResultType result;
    399     for (int i = 0; i < kResultElementCount; i += 2) {
    400       BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
    401     }
    402     return result;
    403   }
    404 };
    405 
    406 template <class Generator>
    407 class NormalDistribution<Generator, double> {
    408  public:
    409   // The number of elements that will be returned.
    410   static const int kResultElementCount = Generator::kResultElementCount / 2;
    411   // Cost of generation of a single element (in cycles).
    412   static const int kElementCost = 70;
    413   // Indicate that this distribution may take variable number of samples
    414   // during the runtime.
    415   static const bool kVariableSamplesPerOutput = false;
    416   typedef Array<double, kResultElementCount> ResultType;
    417   typedef double ResultElementType;
    418 
    419   PHILOX_DEVICE_INLINE
    420   ResultType operator()(Generator* gen) {
    421     typename Generator::ResultType sample = (*gen)();
    422     ResultType result;
    423     for (int i = 0; i < kResultElementCount; i += 2) {
    424       const int i2 = 2 * i;
    425       BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2],
    426                       sample[i2 + 3], &result[i], &result[i + 1]);
    427     }
    428     return result;
    429   }
    430 };
    431 
    432 // A class that returns standard normal distribution between
    433 // [-kTruncateValue, kTruncateValue].
    434 // Arguments:
    435 //   Generator: a generator type that returns a number of uint32 upon each
    436 //              each invocation. It needs to define kResultElementCount for the
    437 //              sample count for each invocation, and ResultType for actual
    438 //              returned sample type.
    439 //   RealType: the data type of the real numbers that will be returned by the
    440 //             distribution. This could be either float or double for now.
    441 // This class is meant to be implemented through specialization. The default
    442 // is not defined by design.
    443 template <class SingleSampleGenerator, typename RealType>
    444 class TruncatedNormalDistribution;
    445 
    446 // Exactly like the float version, except that we convert to half afterwards;
    447 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
    448 // gain from working in half internally.
    449 template <class SingleSampleGenerator>
    450 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
    451  public:
    452   // The number of elements that will be returned.
    453   static const int kResultElementCount =
    454       SingleSampleGenerator::kNativeElementCount;
    455   // Cost of generation of a single element (in cycles).
    456   static const int kElementCost = 90;
    457   // Indicate that this distribution may take variable number of samples
    458   // during the runtime.
    459   static const bool kVariableSamplesPerOutput = true;
    460   // The threshold where the normal distribution is truncated.
    461   const float kTruncateValue = 2.0f;
    462 
    463   typedef Array<Eigen::half, kResultElementCount> ResultType;
    464   typedef Eigen::half ResultElementType;
    465 
    466   PHILOX_DEVICE_INLINE
    467   ResultType operator()(SingleSampleGenerator* gen) {
    468     ResultType results;
    469     int index = 0;
    470     while (true) {
    471       // Repeatedly take samples from the normal distribution, until we have
    472       // the desired number of elements that fall within the pre-defined cutoff
    473       // threshold.
    474       const uint32 x0 = (*gen)();
    475       const uint32 x1 = (*gen)();
    476       float f[2];
    477       BoxMullerFloat(x0, x1, &f[0], &f[1]);
    478 
    479       for (int i = 0; i < 2; ++i) {
    480         if (Eigen::numext::abs(f[i]) < kTruncateValue) {
    481           results[index++] = Eigen::half(f[i]);
    482           if (index >= kResultElementCount) {
    483             return results;
    484           }
    485         }
    486       }
    487     }
    488   }
    489 };
    490 
    491 template <class SingleSampleGenerator>
    492 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
    493  public:
    494   // The number of elements that will be returned.
    495   static const int kResultElementCount =
    496       SingleSampleGenerator::kNativeElementCount;
    497   // Cost of generation of a single element (in cycles).
    498   static const int kElementCost = 90;
    499   // Indicate that this distribution may take variable number of samples
    500   // during the runtime.
    501   static const bool kVariableSamplesPerOutput = true;
    502   // The threshold where the normal distribution is truncated.
    503   const float kTruncateValue = 2.0f;
    504 
    505   typedef Array<bfloat16, kResultElementCount> ResultType;
    506   typedef bfloat16 ResultElementType;
    507 
    508   PHILOX_DEVICE_INLINE
    509   ResultType operator()(SingleSampleGenerator* gen) {
    510     ResultType results;
    511     int index = 0;
    512     while (true) {
    513       // Repeatedly take samples from the normal distribution, until we have
    514       // the desired number of elements that fall within the pre-defined cutoff
    515       // threshold.
    516       const uint32 x0 = (*gen)();
    517       const uint32 x1 = (*gen)();
    518       float f[2];
    519       BoxMullerFloat(x0, x1, &f[0], &f[1]);
    520 
    521       for (int i = 0; i < 2; ++i) {
    522         if (Eigen::numext::abs(f[i]) < kTruncateValue) {
    523           results[index++] = bfloat16(f[i]);
    524           if (index >= kResultElementCount) {
    525             return results;
    526           }
    527         }
    528       }
    529     }
    530   }
    531 };
    532 
    533 // Partial specialization for float.
    534 template <class SingleSampleGenerator>
    535 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
    536  public:
    537   // The number of elements that will be returned.
    538   static const int kResultElementCount =
    539       SingleSampleGenerator::kNativeElementCount;
    540   // Cost of generation of a single element (in cycles).
    541   static const int kElementCost = 90;
    542   // Indicate that this distribution may take variable number of samples
    543   // during the runtime.
    544   static const bool kVariableSamplesPerOutput = true;
    545   // The threshold where the normal distribution is truncated.
    546   const float kTruncateValue = 2.0f;
    547 
    548   typedef Array<float, kResultElementCount> ResultType;
    549   typedef float ResultElementType;
    550 
    551   PHILOX_DEVICE_INLINE
    552   ResultType operator()(SingleSampleGenerator* gen) {
    553     ResultType results;
    554     int index = 0;
    555     while (true) {
    556       // Repeatedly take samples from the normal distribution, until we have
    557       // the desired number of elements that fall within the pre-defined cutoff
    558       // threshold.
    559       const uint32 x0 = (*gen)();
    560       const uint32 x1 = (*gen)();
    561       float f[2];
    562       BoxMullerFloat(x0, x1, &f[0], &f[1]);
    563 
    564       for (int i = 0; i < 2; ++i) {
    565         if (Eigen::numext::abs(f[i]) < kTruncateValue) {
    566           results[index++] = f[i];
    567           if (index >= kResultElementCount) {
    568             return results;
    569           }
    570         }
    571       }
    572     }
    573   }
    574 };
    575 
    576 // Partial specialization for double.
    577 template <class SingleSampleGenerator>
    578 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
    579  public:
    580   // The number of elements that will be returned.
    581   static const int kResultElementCount =
    582       (SingleSampleGenerator::kNativeElementCount > 1)
    583           ? SingleSampleGenerator::kNativeElementCount / 2
    584           : 1;
    585   // Cost of generation of a single element (in cycles).
    586   static const int kElementCost = 90;
    587   // Indicate that this distribution may take variable number of samples
    588   // during the runtime.
    589   static const bool kVariableSamplesPerOutput = true;
    590   typedef Array<double, kResultElementCount> ResultType;
    591   typedef double ResultElementType;
    592   const double kTruncateValue = 2.0;
    593 
    594   PHILOX_DEVICE_INLINE
    595   ResultType operator()(SingleSampleGenerator* gen) {
    596     ResultType results;
    597     int index = 0;
    598     while (1) {
    599       const uint32 x0 = (*gen)();
    600       const uint32 x1 = (*gen)();
    601       const uint32 x2 = (*gen)();
    602       const uint32 x3 = (*gen)();
    603       double d[2];
    604       BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
    605 
    606       for (int i = 0; i < 2; ++i) {
    607         if (Eigen::numext::abs(d[i]) < kTruncateValue) {
    608           results[index++] = d[i];
    609           if (index >= kResultElementCount) {
    610             return results;
    611           }
    612         }
    613       }
    614     }
    615   }
    616 };
    617 
    618 // Helper function to convert two 32-bit uniform integers to two floats
    619 // under the unit normal distribution.
    620 PHILOX_DEVICE_INLINE
    621 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
    622   // This function implements the Box-Muller transform:
    623   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
    624   // Do not send a really small number to log().
    625   // We cannot mark "epsilon" as "static const" because NVCC would complain
    626   const float epsilon = 1.0e-7f;
    627   float u1 = Uint32ToFloat(x0);
    628   if (u1 < epsilon) {
    629     u1 = epsilon;
    630   }
    631   const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
    632   const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
    633 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
    634   *f0 = Eigen::numext::sin(v1);
    635   *f1 = Eigen::numext::cos(v1);
    636 #else
    637   sincosf(v1, f0, f1);
    638 #endif
    639   *f0 *= u2;
    640   *f1 *= u2;
    641 }
    642 
    643 // Helper function to convert four 32-bit uniform integers to two doubles
    644 // under the unit normal distribution.
    645 PHILOX_DEVICE_INLINE
    646 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
    647                      double* d1) {
    648   // This function implements the Box-Muller transform:
    649   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
    650   // Do not send a really small number to log().
    651   // We cannot mark "epsilon" as "static const" because NVCC would complain
    652   const double epsilon = 1.0e-7;
    653   double u1 = Uint64ToDouble(x0, x1);
    654   if (u1 < epsilon) {
    655     u1 = epsilon;
    656   }
    657   const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
    658   const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
    659 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
    660   *d0 = Eigen::numext::sin(v1);
    661   *d1 = Eigen::numext::cos(v1);
    662 #else
    663   sincos(v1, d0, d1);
    664 #endif
    665   *d0 *= u2;
    666   *d1 *= u2;
    667 }
    668 
    669 // Helper function to convert an 16-bit integer to a half between [0..1).
    670 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
    671   // IEEE754 halfs are formatted as follows (MSB first):
    672   //    sign(1) exponent(5) mantissa(10)
    673   // Conceptually construct the following:
    674   //    sign == 0
    675   //    exponent == 15  -- an excess 15 representation of a zero exponent
    676   //    mantissa == 10 random bits
    677   const uint16 man = x & 0x3ffu;  // 10 bit mantissa
    678   const uint16 exp = static_cast<uint16>(15);
    679   const uint16 val = (exp << 10) | man;
    680 
    681   Eigen::half result;
    682   result.x = val;
    683   return result - Eigen::half(1.0);
    684 }
    685 
    686 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
    687 // This can create a uniform distribution of values between [0..1).
    688 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
    689   // bfloat are formatted as follows (MSB first):
    690   //    sign(1) exponent(8) mantissa(7)
    691   // Conceptually construct the following:
    692   //    sign == 0
    693   //    exponent == 127  -- an excess 127 representation of a zero exponent
    694   //    mantissa == 7 random bits
    695   const uint16 man = x & 0x7fu;  // 7 bit mantissa
    696   const uint16 exp = static_cast<uint16>(127);
    697   const uint16 val = (exp << 7) | man;
    698 
    699   bfloat16 result;
    700   memcpy(&result, &val, sizeof(val));
    701   // The mantissa has an implicit leading 1, so the above code creates a value
    702   // in [1, 2). The minus will not cause a rounding that makes the result 1.
    703   // Instead it will just be close to 1.
    704   return result - bfloat16(1.0);
    705 }
    706 
    707 // Helper function to convert an 32-bit integer to a float between [0..1).
    708 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
    709   // IEEE754 floats are formatted as follows (MSB first):
    710   //    sign(1) exponent(8) mantissa(23)
    711   // Conceptually construct the following:
    712   //    sign == 0
    713   //    exponent == 127  -- an excess 127 representation of a zero exponent
    714   //    mantissa == 23 random bits
    715   const uint32 man = x & 0x7fffffu;  // 23 bit mantissa
    716   const uint32 exp = static_cast<uint32>(127);
    717   const uint32 val = (exp << 23) | man;
    718 
    719   // Assumes that endian-ness is same for float and uint32.
    720   float result;
    721   memcpy(&result, &val, sizeof(val));
    722   return result - 1.0f;
    723 }
    724 
    725 // Helper function to convert two 32-bit integers to a double between [0..1).
    726 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
    727   // IEEE754 doubles are formatted as follows (MSB first):
    728   //    sign(1) exponent(11) mantissa(52)
    729   // Conceptually construct the following:
    730   //    sign == 0
    731   //    exponent == 1023  -- an excess 1023 representation of a zero exponent
    732   //    mantissa == 52 random bits
    733   const uint32 mhi = x0 & 0xfffffu;  // upper 20 bits of mantissa
    734   const uint32 mlo = x1;             // lower 32 bits of mantissa
    735   const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo;  // mantissa
    736   const uint64 exp = static_cast<uint64>(1023);
    737   const uint64 val = (exp << 52) | man;
    738   // Assumes that endian-ness is same for double and uint64.
    739   double result;
    740   memcpy(&result, &val, sizeof(val));
    741   return result - 1.0;
    742 }
    743 
    744 }  // namespace random
    745 }  // namespace tensorflow
    746 
    747 #endif  // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
    748