Home | History | Annotate | Download | only in random
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
     17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
     18 
     19 #define _USE_MATH_DEFINES
     20 #include <math.h>
     21 #include <cmath>
     22 #undef _USE_MATH_DEFINES
     23 
     24 #include <string.h>
     25 #include <algorithm>
     26 #include <type_traits>
     27 
     28 #include "philox_random.h"
     29 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
     30 #include "unsupported/Eigen/CXX11/Tensor"
     31 
     32 namespace tensorflow {
     33 namespace random {
     34 
     35 // Helper function to convert a 16-bit integer to a half between [0..1).
     36 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
     37 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
     38 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
     39 // Helper function to convert a 32-bit integer to a float between [0..1).
     40 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
     41 // Helper function to convert two 32-bit integers to a double between [0..1).
     42 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
     43 
     44 // Computes a + b. Requires that the result is representable in the destination
     45 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
     46 // need *not* be representable in that type. (The condition on b excludes the
     47 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
     48 // compute.)
     49 template <typename Int>
     50 PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b) {
     51     // Implementation note: both b_div_2 and b - b_div_2 are positive and
     52     // representatble as Int.
     53     auto b_div_2 = b >> 1;
     54     return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
     55 }
     56 
     57 // A class that generates uniform distribution random numbers from the
     58 // underlying random integer generator.
     59 // Arguments:
     60 //   Generator: a generator type that returns a number of uint32 upon each
     61 //              invocation. It needs to define kResultElementCount for the
     62 //              sample count for each invocation, and ResultType for the
     63 //              actual returned sample type.
     64 //   RealType: the data type of the real numbers that will be returned by the
     65 //             distribution. This could be either float or double for now.
     66 // This class is meant to be implemented through specialization. The default
     67 // is not defined by design.
     68 template <class Generator, typename RealType>
     69 class UniformDistribution;
     70 
     71 template <class Generator>
     72 class UniformDistribution<Generator, Eigen::half> {
     73    public:
     74     // The number of elements that will be returned.
     75     static const int kResultElementCount = Generator::kResultElementCount;
     76     // Cost of generation of a single element (in cycles).
     77     static const int kElementCost = 3;
     78     // Indicate that this distribution may take variable number of samples
     79     // during the runtime.
     80     static const bool kVariableSamplesPerOutput = false;
     81     typedef Array<Eigen::half, kResultElementCount> ResultType;
     82     typedef Eigen::half ResultElementType;
     83 
     84     PHILOX_DEVICE_INLINE
     85     ResultType operator()(Generator* gen) {
     86         typename Generator::ResultType sample = (*gen)();
     87         ResultType result;
     88         for (int i = 0; i < kResultElementCount; ++i) {
     89             result[i] = Uint16ToHalf(sample[i]);  // Truncate the upper 16 bits.
     90         }
     91         return result;
     92     }
     93 };
     94 
     95 template <class Generator>
     96 class UniformDistribution<Generator, bfloat16> {
     97    public:
     98     // The number of elements that will be returned.
     99     static const int kResultElementCount = Generator::kResultElementCount;
    100     // Cost of generation of a single element (in cycles).
    101     static const int kElementCost = 3;
    102     // Indicate that this distribution may take variable number of samples
    103     // during the runtime.
    104     static const bool kVariableSamplesPerOutput = false;
    105     typedef Array<bfloat16, kResultElementCount> ResultType;
    106     typedef bfloat16 ResultElementType;
    107 
    108     PHILOX_DEVICE_INLINE
    109     ResultType operator()(Generator* gen) {
    110         typename Generator::ResultType sample = (*gen)();
    111         ResultType result;
    112         for (int i = 0; i < kResultElementCount; ++i) {
    113             result[i] = Uint16ToGfloat16(sample[i]);
    114         }
    115         return result;
    116     }
    117 };
    118 
    119 template <class Generator>
    120 class UniformDistribution<Generator, float> {
    121    public:
    122     // The number of elements that will be returned.
    123     static const int kResultElementCount = Generator::kResultElementCount;
    124     // Cost of generation of a single element (in cycles).
    125     static const int kElementCost = 3;
    126     // Indicate that this distribution may take variable number of samples
    127     // during the runtime.
    128     static const bool kVariableSamplesPerOutput = false;
    129     typedef Array<float, kResultElementCount> ResultType;
    130     typedef float ResultElementType;
    131 
    132     PHILOX_DEVICE_INLINE
    133     ResultType operator()(Generator* gen) {
    134         typename Generator::ResultType sample = (*gen)();
    135         ResultType result;
    136         for (int i = 0; i < kResultElementCount; ++i) {
    137             result[i] = Uint32ToFloat(sample[i]);
    138         }
    139         return result;
    140     }
    141 };
    142 
    143 template <class Generator>
    144 class UniformDistribution<Generator, double> {
    145    public:
    146     // The number of elements that will be returned.
    147     static const int kResultElementCount = Generator::kResultElementCount / 2;
    148     // Cost of generation of a single element (in cycles).
    149     static const int kElementCost = 3;
    150     // Indicate that this distribution may take variable number of samples
    151     // during the runtime.
    152     static const bool kVariableSamplesPerOutput = false;
    153     typedef Array<double, kResultElementCount> ResultType;
    154     typedef double ResultElementType;
    155 
    156     PHILOX_DEVICE_INLINE
    157     ResultType operator()(Generator* gen) {
    158         typename Generator::ResultType sample = (*gen)();
    159         ResultType result;
    160         for (int i = 0; i < kResultElementCount; ++i) {
    161             result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
    162         }
    163         return result;
    164     }
    165 };
    166 
    167 template <class Generator>
    168 class UniformDistribution<Generator, int32> {
    169    public:
    170     // The number of elements that will be returned.
    171     static const int kResultElementCount = Generator::kResultElementCount;
    172     // Cost of generation of a single element (in cycles).
    173     static const int kElementCost = 3;
    174     // Indicate that this distribution may take variable number of samples
    175     // during the runtime.
    176     static const bool kVariableSamplesPerOutput = false;
    177     typedef Array<int32, kResultElementCount> ResultType;
    178     typedef int32 ResultElementType;
    179 
    180     // Must have lo < hi
    181     UniformDistribution(int32 lo, int32 hi)
    182         : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
    183 
    184     PHILOX_DEVICE_INLINE
    185     ResultType operator()(Generator* gen) {
    186         typename Generator::ResultType sample = (*gen)();
    187         ResultType result;
    188         for (int i = 0; i < kResultElementCount; ++i) {
    189             result[i] = SignedAdd(lo_, sample[i] % range_);
    190         }
    191         return result;
    192     }
    193 
    194    private:
    195     // Note that lo_ is intentionally signed while range_ is intentionally
    196     // unsigned.  This is because hi - lo can overflow signed integers if
    197     // lo < 0 < hi, but always fits in unsigned.
    198     int32 lo_;
    199     uint32 range_;
    200 };
    201 
    202 template <class Generator>
    203 class UniformDistribution<Generator, int64> {
    204    public:
    205     // The number of elements that will be returned.
    206     static const int kResultElementCount = Generator::kResultElementCount / 2;
    207     // Cost of generation of a single element (in cycles).
    208     static const int kElementCost = 3;
    209     // Indicate that this distribution may take variable number of samples
    210     // during the runtime.
    211     static const bool kVariableSamplesPerOutput = false;
    212     typedef Array<int64, kResultElementCount> ResultType;
    213     typedef int64 ResultElementType;
    214 
    215     // Must have lo < hi
    216     UniformDistribution(int64 lo, int64 hi)
    217         : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
    218 
    219     PHILOX_DEVICE_INLINE
    220     ResultType operator()(Generator* gen) {
    221         typename Generator::ResultType sample = (*gen)();
    222         ResultType result;
    223         for (int i = 0; i < kResultElementCount; ++i) {
    224             auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
    225             result[i] = SignedAdd(lo_, bits % range_);
    226         }
    227         return result;
    228     }
    229 
    230    private:
    231     // Note that lo_ is intentionally signed while range_ is intentionally
    232     // unsigned.  This is because hi - lo can overflow signed integers if
    233     // lo < 0 < hi, but always fits in unsigned.
    234     int64 lo_;
    235     uint64 range_;
    236 };
    237 
    238 // A class that adapts the underlying native multiple samples to return a single
    239 // sample at a time.
    240 template <class Generator>
    241 class SingleSampleAdapter {
    242    public:
    243     // The number of elements that will be returned.
    244     static const int kResultElementCount = 1;
    245     // The number of elements that will be returned by the underlying generator.
    246     static const int kNativeElementCount = Generator::kResultElementCount;
    247     typedef typename Generator::ResultElementType ResultType;
    248     typedef typename Generator::ResultElementType ResultElementType;
    249 
    250     PHILOX_DEVICE_INLINE
    251     explicit SingleSampleAdapter(Generator* gen)
    252         : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
    253 
    254     PHILOX_DEVICE_INLINE
    255     ResultType operator()() {
    256         if (used_result_index_ == Generator::kResultElementCount) {
    257             unused_results_ = (*generator_)();
    258             used_result_index_ = 0;
    259         }
    260 
    261         return unused_results_[used_result_index_++];
    262     }
    263 
    264     PHILOX_DEVICE_INLINE
    265     void Skip(uint64 num_skips) {
    266         if (!num_skips) {
    267             return;
    268         }
    269         int num_unused_results = kNativeElementCount - used_result_index_;
    270         if (num_skips <= num_unused_results) {
    271             used_result_index_ += num_skips;
    272             return;
    273         }
    274         num_skips -= num_unused_results;
    275         used_result_index_ = kNativeElementCount;
    276         SkipFromGenerator(num_skips / kNativeElementCount);
    277         num_skips = num_skips % kNativeElementCount;
    278         if (num_skips) {
    279             unused_results_ = (*generator_)();
    280             used_result_index_ = num_skips;
    281         }
    282     }
    283 
    284    private:
    285     // This implementation iteratively skips over `num_skips` samples
    286     // from `generator_`. There is an O(1) implementation for PhiloxRandom
    287     // in random_distributions.cc.
    288     PHILOX_DEVICE_INLINE
    289     void SkipFromGenerator(uint64 num_skips) {
    290         while (num_skips--) {
    291             (*generator_)();
    292         }
    293     }
    294 
    295     Generator* generator_;
    296     typename Generator::ResultType unused_results_;
    297     int used_result_index_;
    298 };
    299 
    300 // A class that generates unit normal distribution random numbers from the
    301 // underlying random integer generator.
    302 // Arguments:
    303 //   Generator: a generator type that returns a number of uint32 upon each
    304 //              each invocation. It needs to define kResultElementCount for the
    305 //              sample count for each invocation, and ResultType for actual
    306 //              returned sample type.
    307 //   RealType: the data type of the real numbers that will be returned by the
    308 //             distribution. This could be either float or double for now.
    309 // This class is meant to be implemented through specialization. The default
    310 // is not defined by design.
    311 template <class Generator, typename RealType>
    312 class NormalDistribution;
    313 
    314 PHILOX_DEVICE_INLINE
    315 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
    316 
    317 PHILOX_DEVICE_INLINE
    318 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1);
    319 
    320 // Exactly like the float version, except that we convert to half afterwards;
    321 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
    322 // gain from working in half internally.
    323 template <class Generator>
    324 class NormalDistribution<Generator, Eigen::half> {
    325    public:
    326     // The number of elements that will be returned.
    327     static const int kResultElementCount = Generator::kResultElementCount;
    328     // Cost of generation of a single element (in cycles).
    329     static const int kElementCost = 70;
    330     // Indicate that this distribution may take variable number of samples
    331     // during the runtime.
    332     static const bool kVariableSamplesPerOutput = false;
    333     typedef Array<Eigen::half, kResultElementCount> ResultType;
    334     typedef Eigen::half ResultElementType;
    335 
    336     PHILOX_DEVICE_INLINE
    337     ResultType operator()(Generator* gen) {
    338         typename Generator::ResultType sample = (*gen)();
    339         ResultType result;
    340         for (int i = 0; i < kResultElementCount; i += 2) {
    341             float f[2];
    342             BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
    343             result[i] = Eigen::half(f[0]);
    344             result[i + 1] = Eigen::half(f[1]);
    345         }
    346         return result;
    347     }
    348 };
    349 
    350 template <class Generator>
    351 class NormalDistribution<Generator, bfloat16> {
    352    public:
    353     // The number of elements that will be returned.
    354     static const int kResultElementCount = Generator::kResultElementCount;
    355     // Cost of generation of a single element (in cycles).
    356     static const int kElementCost = 70;
    357     // Indicate that this distribution may take variable number of samples
    358     // during the runtime.
    359     static const bool kVariableSamplesPerOutput = false;
    360     typedef Array<bfloat16, kResultElementCount> ResultType;
    361     typedef bfloat16 ResultElementType;
    362 
    363     PHILOX_DEVICE_INLINE
    364     ResultType operator()(Generator* gen) {
    365         typename Generator::ResultType sample = (*gen)();
    366         ResultType result;
    367         static_assert(kResultElementCount % 2 == 0, "kResultElementCount should be an even number");
    368         for (int i = 0; i < kResultElementCount; i += 2) {
    369             float f[2];
    370             // Box-Muller transform requires processing 2 elements at a time.
    371             BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
    372             result[i] = bfloat16(f[0]);
    373             result[i + 1] = bfloat16(f[1]);
    374         }
    375         return result;
    376     }
    377 };
    378 
    379 template <class Generator>
    380 class NormalDistribution<Generator, float> {
    381    public:
    382     // The number of elements that will be returned.
    383     static const int kResultElementCount = Generator::kResultElementCount;
    384     // Cost of generation of a single element (in cycles).
    385     static const int kElementCost = 70;
    386     // Indicate that this distribution may take variable number of samples
    387     // during the runtime.
    388     static const bool kVariableSamplesPerOutput = false;
    389     typedef Array<float, kResultElementCount> ResultType;
    390     typedef float ResultElementType;
    391 
    392     PHILOX_DEVICE_INLINE
    393     ResultType operator()(Generator* gen) {
    394         typename Generator::ResultType sample = (*gen)();
    395         ResultType result;
    396         for (int i = 0; i < kResultElementCount; i += 2) {
    397             BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
    398         }
    399         return result;
    400     }
    401 };
    402 
    403 template <class Generator>
    404 class NormalDistribution<Generator, double> {
    405    public:
    406     // The number of elements that will be returned.
    407     static const int kResultElementCount = Generator::kResultElementCount / 2;
    408     // Cost of generation of a single element (in cycles).
    409     static const int kElementCost = 70;
    410     // Indicate that this distribution may take variable number of samples
    411     // during the runtime.
    412     static const bool kVariableSamplesPerOutput = false;
    413     typedef Array<double, kResultElementCount> ResultType;
    414     typedef double ResultElementType;
    415 
    416     PHILOX_DEVICE_INLINE
    417     ResultType operator()(Generator* gen) {
    418         typename Generator::ResultType sample = (*gen)();
    419         ResultType result;
    420         for (int i = 0; i < kResultElementCount; i += 2) {
    421             const int i2 = 2 * i;
    422             BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
    423                             &result[i + 1]);
    424         }
    425         return result;
    426     }
    427 };
    428 
    429 // A class that returns standard normal distribution between
    430 // [-kTruncateValue, kTruncateValue].
    431 // Arguments:
    432 //   Generator: a generator type that returns a number of uint32 upon each
    433 //              each invocation. It needs to define kResultElementCount for the
    434 //              sample count for each invocation, and ResultType for actual
    435 //              returned sample type.
    436 //   RealType: the data type of the real numbers that will be returned by the
    437 //             distribution. This could be either float or double for now.
    438 // This class is meant to be implemented through specialization. The default
    439 // is not defined by design.
    440 template <class SingleSampleGenerator, typename RealType>
    441 class TruncatedNormalDistribution;
    442 
    443 // Exactly like the float version, except that we convert to half afterwards;
    444 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
    445 // gain from working in half internally.
    446 template <class SingleSampleGenerator>
    447 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
    448    public:
    449     // The number of elements that will be returned.
    450     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
    451     // Cost of generation of a single element (in cycles).
    452     static const int kElementCost = 90;
    453     // Indicate that this distribution may take variable number of samples
    454     // during the runtime.
    455     static const bool kVariableSamplesPerOutput = true;
    456     // The threshold where the normal distribution is truncated.
    457     const float kTruncateValue = 2.0f;
    458 
    459     typedef Array<Eigen::half, kResultElementCount> ResultType;
    460     typedef Eigen::half ResultElementType;
    461 
    462     PHILOX_DEVICE_INLINE
    463     ResultType operator()(SingleSampleGenerator* gen) {
    464         ResultType results;
    465         int index = 0;
    466         while (true) {
    467             // Repeatedly take samples from the normal distribution, until we have
    468             // the desired number of elements that fall within the pre-defined cutoff
    469             // threshold.
    470             const uint32 x0 = (*gen)();
    471             const uint32 x1 = (*gen)();
    472             float f[2];
    473             BoxMullerFloat(x0, x1, &f[0], &f[1]);
    474 
    475             for (int i = 0; i < 2; ++i) {
    476                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
    477                     results[index++] = Eigen::half(f[i]);
    478                     if (index >= kResultElementCount) {
    479                         return results;
    480                     }
    481                 }
    482             }
    483         }
    484     }
    485 };
    486 
    487 template <class SingleSampleGenerator>
    488 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
    489    public:
    490     // The number of elements that will be returned.
    491     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
    492     // Cost of generation of a single element (in cycles).
    493     static const int kElementCost = 90;
    494     // Indicate that this distribution may take variable number of samples
    495     // during the runtime.
    496     static const bool kVariableSamplesPerOutput = true;
    497     // The threshold where the normal distribution is truncated.
    498     const float kTruncateValue = 2.0f;
    499 
    500     typedef Array<bfloat16, kResultElementCount> ResultType;
    501     typedef bfloat16 ResultElementType;
    502 
    503     PHILOX_DEVICE_INLINE
    504     ResultType operator()(SingleSampleGenerator* gen) {
    505         ResultType results;
    506         int index = 0;
    507         while (true) {
    508             // Repeatedly take samples from the normal distribution, until we have
    509             // the desired number of elements that fall within the pre-defined cutoff
    510             // threshold.
    511             const uint32 x0 = (*gen)();
    512             const uint32 x1 = (*gen)();
    513             float f[2];
    514             BoxMullerFloat(x0, x1, &f[0], &f[1]);
    515 
    516             for (int i = 0; i < 2; ++i) {
    517                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
    518                     results[index++] = bfloat16(f[i]);
    519                     if (index >= kResultElementCount) {
    520                         return results;
    521                     }
    522                 }
    523             }
    524         }
    525     }
    526 };
    527 
    528 // Partial specialization for float.
    529 template <class SingleSampleGenerator>
    530 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
    531    public:
    532     // The number of elements that will be returned.
    533     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
    534     // Cost of generation of a single element (in cycles).
    535     static const int kElementCost = 90;
    536     // Indicate that this distribution may take variable number of samples
    537     // during the runtime.
    538     static const bool kVariableSamplesPerOutput = true;
    539     // The threshold where the normal distribution is truncated.
    540     const float kTruncateValue = 2.0f;
    541 
    542     typedef Array<float, kResultElementCount> ResultType;
    543     typedef float ResultElementType;
    544 
    545     PHILOX_DEVICE_INLINE
    546     ResultType operator()(SingleSampleGenerator* gen) {
    547         ResultType results;
    548         int index = 0;
    549         while (true) {
    550             // Repeatedly take samples from the normal distribution, until we have
    551             // the desired number of elements that fall within the pre-defined cutoff
    552             // threshold.
    553             const uint32 x0 = (*gen)();
    554             const uint32 x1 = (*gen)();
    555             float f[2];
    556             BoxMullerFloat(x0, x1, &f[0], &f[1]);
    557 
    558             for (int i = 0; i < 2; ++i) {
    559                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
    560                     results[index++] = f[i];
    561                     if (index >= kResultElementCount) {
    562                         return results;
    563                     }
    564                 }
    565             }
    566         }
    567     }
    568 };
    569 
    570 // Partial specialization for double.
    571 template <class SingleSampleGenerator>
    572 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
    573    public:
    574     // The number of elements that will be returned.
    575     static const int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
    576                                                    ? SingleSampleGenerator::kNativeElementCount / 2
    577                                                    : 1;
    578     // Cost of generation of a single element (in cycles).
    579     static const int kElementCost = 90;
    580     // Indicate that this distribution may take variable number of samples
    581     // during the runtime.
    582     static const bool kVariableSamplesPerOutput = true;
    583     typedef Array<double, kResultElementCount> ResultType;
    584     typedef double ResultElementType;
    585     const double kTruncateValue = 2.0;
    586 
    587     PHILOX_DEVICE_INLINE
    588     ResultType operator()(SingleSampleGenerator* gen) {
    589         ResultType results;
    590         int index = 0;
    591         while (1) {
    592             const uint32 x0 = (*gen)();
    593             const uint32 x1 = (*gen)();
    594             const uint32 x2 = (*gen)();
    595             const uint32 x3 = (*gen)();
    596             double d[2];
    597             BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
    598 
    599             for (int i = 0; i < 2; ++i) {
    600                 if (Eigen::numext::abs(d[i]) < kTruncateValue) {
    601                     results[index++] = d[i];
    602                     if (index >= kResultElementCount) {
    603                         return results;
    604                     }
    605                 }
    606             }
    607         }
    608     }
    609 };
    610 
    611 // Helper function to convert two 32-bit uniform integers to two floats
    612 // under the unit normal distribution.
    613 PHILOX_DEVICE_INLINE
    614 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
    615     // This function implements the Box-Muller transform:
    616     // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
    617     // Do not send a really small number to log().
    618     // We cannot mark "epsilon" as "static const" because NVCC would complain
    619     const float epsilon = 1.0e-7f;
    620     float u1 = Uint32ToFloat(x0);
    621     if (u1 < epsilon) {
    622         u1 = epsilon;
    623     }
    624     const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
    625     const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
    626 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
    627     *f0 = Eigen::numext::sin(v1);
    628     *f1 = Eigen::numext::cos(v1);
    629 #else
    630     sincosf(v1, f0, f1);
    631 #endif
    632     *f0 *= u2;
    633     *f1 *= u2;
    634 }
    635 
    636 // Helper function to convert four 32-bit uniform integers to two doubles
    637 // under the unit normal distribution.
    638 PHILOX_DEVICE_INLINE
    639 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1) {
    640     // This function implements the Box-Muller transform:
    641     // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
    642     // Do not send a really small number to log().
    643     // We cannot mark "epsilon" as "static const" because NVCC would complain
    644     const double epsilon = 1.0e-7;
    645     double u1 = Uint64ToDouble(x0, x1);
    646     if (u1 < epsilon) {
    647         u1 = epsilon;
    648     }
    649     const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
    650     const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
    651 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
    652     *d0 = Eigen::numext::sin(v1);
    653     *d1 = Eigen::numext::cos(v1);
    654 #else
    655     sincos(v1, d0, d1);
    656 #endif
    657     *d0 *= u2;
    658     *d1 *= u2;
    659 }
    660 
    661 // Helper function to convert an 16-bit integer to a half between [0..1).
    662 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
    663     // IEEE754 halfs are formatted as follows (MSB first):
    664     //    sign(1) exponent(5) mantissa(10)
    665     // Conceptually construct the following:
    666     //    sign == 0
    667     //    exponent == 15  -- an excess 15 representation of a zero exponent
    668     //    mantissa == 10 random bits
    669     const uint16 man = x & 0x3ffu;  // 10 bit mantissa
    670     const uint16 exp = static_cast<uint16>(15);
    671     const uint16 val = (exp << 10) | man;
    672 
    673     Eigen::half result;
    674     result.x = val;
    675     return result - Eigen::half(1.0);
    676 }
    677 
    678 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
    679 // This can create a uniform distribution of values between [0..1).
    680 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
    681     // bfloat are formatted as follows (MSB first):
    682     //    sign(1) exponent(8) mantissa(7)
    683     // Conceptually construct the following:
    684     //    sign == 0
    685     //    exponent == 127  -- an excess 127 representation of a zero exponent
    686     //    mantissa == 7 random bits
    687     const uint16 man = x & 0x7fu;  // 7 bit mantissa
    688     const uint16 exp = static_cast<uint16>(127);
    689     const uint16 val = (exp << 7) | man;
    690 
    691     bfloat16 result;
    692     memcpy(&result, &val, sizeof(val));
    693     // The mantissa has an implicit leading 1, so the above code creates a value
    694     // in [1, 2). The minus will not cause a rounding that makes the result 1.
    695     // Instead it will just be close to 1.
    696     return result - bfloat16(1.0);
    697 }
    698 
    699 // Helper function to convert an 32-bit integer to a float between [0..1).
    700 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
    701     // IEEE754 floats are formatted as follows (MSB first):
    702     //    sign(1) exponent(8) mantissa(23)
    703     // Conceptually construct the following:
    704     //    sign == 0
    705     //    exponent == 127  -- an excess 127 representation of a zero exponent
    706     //    mantissa == 23 random bits
    707     const uint32 man = x & 0x7fffffu;  // 23 bit mantissa
    708     const uint32 exp = static_cast<uint32>(127);
    709     const uint32 val = (exp << 23) | man;
    710 
    711     // Assumes that endian-ness is same for float and uint32.
    712     float result;
    713     memcpy(&result, &val, sizeof(val));
    714     return result - 1.0f;
    715 }
    716 
    717 // Helper function to convert two 32-bit integers to a double between [0..1).
    718 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
    719     // IEEE754 doubles are formatted as follows (MSB first):
    720     //    sign(1) exponent(11) mantissa(52)
    721     // Conceptually construct the following:
    722     //    sign == 0
    723     //    exponent == 1023  -- an excess 1023 representation of a zero exponent
    724     //    mantissa == 52 random bits
    725     const uint32 mhi = x0 & 0xfffffu;                           // upper 20 bits of mantissa
    726     const uint32 mlo = x1;                                      // lower 32 bits of mantissa
    727     const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo;  // mantissa
    728     const uint64 exp = static_cast<uint64>(1023);
    729     const uint64 val = (exp << 52) | man;
    730     // Assumes that endian-ness is same for double and uint64.
    731     double result;
    732     memcpy(&result, &val, sizeof(val));
    733     return result - 1.0;
    734 }
    735 
    736 }  // namespace random
    737 }  // namespace tensorflow
    738 
    739 #endif  // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
    740